uniprop 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,542 @@
1
+ module UniPropUtils
2
+ class DownloaderWrapper
3
+ # UNICODE_PUBLIC = "https://sw.it.aoyama.ac.jp/2022/sakaida/UCD/"
4
+ UNICODE_PUBLIC = "https://www.unicode.org/Public/"
5
+
6
+ class << self
7
+ # 現在公開されているバージョンの名前をすべて取得
8
+ # @return [Array<String>]
9
+ def get_version_names
10
+ doc = Nokogiri::HTML(URI.open(UNICODE_PUBLIC))
11
+
12
+ version_names = []
13
+ doc.css('tr td a').each do |a|
14
+ begin
15
+ version_name = a.content[..-2]
16
+ UniProp::Version.parse(version_name)
17
+ version_names << version_name
18
+ rescue UniProp::ParseError
19
+ end
20
+ end
21
+
22
+ version_names
23
+ end
24
+
25
+ # URLを指定して1つのUnicodeファイルをダウンロード
26
+ # @note ベータ版ファイルをDownloader::downloadでダウンロードすると、ファイル名が異なる場合に古いファイルが削除されない。その問題を回避するため、ファイル名をprefixのみを使用してダウンロードするメソッド
27
+ # @param [Pathname] url ダウンロードするファイルのURLの絶対パス
28
+ # @param [Pathname] cache_dir_path ダウンロードしたファイルを保存するディレクトリの絶対パス。この下の階層に15.0.0などのバージョン名を表すディレクトリが作成される
29
+ def unicode_download(url, cache_dir_path, unicode_beta: false, since: true)
30
+ relative_url = url.relative_path_from(UNICODE_PUBLIC) # バージョン名より下の階層のパス
31
+ file_cache_path = Pathname.new(cache_dir_path)+relative_url.parent
32
+
33
+ options = {cache_dir: false}
34
+ if unicode_beta
35
+ options[:unicode_beta] = "YES"
36
+ end
37
+
38
+ Downloader.download(UNICODE_PUBLIC+relative_url.to_s, FileManager.prefix_path(relative_url).to_s, dir=file_cache_path.to_s, since=since, options=options)
39
+ end
40
+
41
+ # version_nameに該当するバージョンのファイルをダウンロードする
42
+ # @param [String] version_name
43
+ # @param [Pathname] cache_dir_path
44
+ # @param [Array<String>] excluded_extensions ダウンロードしない拡張子
45
+ # @param [Array<String>] excluded_directories ダウンロードしないディレクトリの名前。excluded_directoriesに名前が含まれるディレクトリより下の階層にあるファイルは、ダウンロード対象から除外される
46
+ # @param [Array<String>] excluded_files ダウンロードしないファイルの名前
47
+ # @param [Array<String>] included_files excluded系引数に除外されている場合でも、included_filesに名前が含まれるファイルはダウンロードされる
48
+ # @param [Boolean] unicode_beta ダウンロード対象がベータ版ならtrue
49
+ def download_version(version_name, cache_dir_path, excluded_extensions, excluded_directories, excluded_files, included_files, unicode_beta: false, since: true)
50
+ file_urls = FileManager.filter_file(files_in_version(version_name), excluded_extensions, excluded_directories, excluded_files, included_files)
51
+
52
+ file_urls.each { unicode_download(_1, cache_dir_path, unicode_beta: unicode_beta, since: since) }
53
+ end
54
+
55
+ # version_nameで指定したバージョンに含まれるファイルのパスを取得
56
+ # @param [String] version_name
57
+ # @return [Array<Pathname>] URLのPathnameのArray
58
+ def files_in_version(version_name)
59
+ UniProp::Version.parse(version_name)
60
+
61
+ version_path = Pathname.new(UNICODE_PUBLIC) + Pathname.new(version_name)
62
+
63
+ files_in(version_path.to_s)
64
+ end
65
+
66
+ # urlよりも下の階層にあるファイルのURLをPathnameオブジェクトで主t九
67
+ # @param [String] url URLの絶対パスを表す文字列
68
+ # @return [Array<Pathname>]
69
+ def files_in(url)
70
+ doc = Nokogiri::HTML(URI.open(url))
71
+
72
+ files = []
73
+ doc.css('tr td a').each do |a|
74
+ if a.keys.include?("href") && !a['href'].start_with?("/")
75
+ if a['href'].end_with?("/")
76
+ child_dir_path = Pathname.new(url) + Pathname.new(a['href'])
77
+
78
+ files.concat(files_in(child_dir_path.to_s))
79
+ else
80
+ files << Pathname.new(url) + Pathname.new(a.content)
81
+ end
82
+ end
83
+ end
84
+ files
85
+ end
86
+
87
+ # prefixがbasename_prefixと一致するファイルを取得
88
+ # @param [String] basename_prefix
89
+ # @param [String] version_name
90
+ # @return [Pathname]
91
+ def find_file_path(basename_prefix, version_name)
92
+ files_in_version(version_name)
93
+ # 例えば4.1.0にはPropList.txtとPropList.htmlの両方が存在
94
+ # txtとzipのみを検索に使用
95
+ .filter { ["txt", "zip"].include?(FileManager.ext_no_dot(_1)) }
96
+ .find { UniProp::Alias.canonical(FileManager.prefix(_1)) == UniProp::Alias.canonical(basename_prefix) }
97
+ end
98
+
99
+ # basename_prefixとversion_nameでファイルを指定してダウンロード
100
+ def unicode_basename_download(basename_prefix, version_name, cache_dir_path, unicode_beta: false, since: true)
101
+ path = find_file_path(basename_prefix, version_name)
102
+
103
+ if path
104
+ unicode_download(path, cache_dir_path, unicode_beta: unicode_beta, since: since)
105
+ else
106
+ raise(UniProp::FileNotFoundError, "#{basename_prefix} is not found in #{version_name}")
107
+ end
108
+ end
109
+
110
+ # Unihan.zipをダウンロード
111
+ def download_unihan(version_name, cache_dir_path, unicode_beta: false, since: true)
112
+ unicode_basename_download("unihan", version_name, cache_dir_path, unicode_beta: unicode_beta, since: since)
113
+ end
114
+ end
115
+ end
116
+
117
+ class TypeJudgementer
118
+ RE_SINGLE_CODEPOINT = /^[0-9A-Fa-f]{4,6}$/
119
+ RE_RANGE_CODEPOINT = /^[0-9A-Fa-f]{4,6}\.\.[0-9A-Fa-f]{4,6}$/
120
+ # NFKC_CaseFoldの00ADのように、空文字列を値に持つStringプロパティも存在するため、空文字列もRE_STRINGにマッチするような実装にしてある
121
+ RE_STRING = /^([0-9A-Fa-f]{4,6}\s*)*$/
122
+ RE_NUMERIC = /
123
+ ^-?\d{1,}$| # integer
124
+ ^-?\d{1,}.\d{1,}$| # float
125
+ ^-?\d{1,}\/\d{1,}$ # rational
126
+ /x
127
+ RE_BINARY = /^Yes$|^Y$|^No$|^N$/
128
+
129
+
130
+ class << self
131
+ # @param [String] str
132
+ def validate_single_codepoint(str)
133
+ str.match?(RE_SINGLE_CODEPOINT)
134
+ end
135
+
136
+ # @param [String] str
137
+ def validate_range_codepoint(str)
138
+ str.match?(RE_RANGE_CODEPOINT)
139
+ end
140
+
141
+ # @param [String] str
142
+ def validate_codepoint(str)
143
+ str.match?(RE_SINGLE_CODEPOINT) || str.match?(RE_RANGE_CODEPOINT)
144
+ end
145
+
146
+ # @param [String] str
147
+ def validate_numeric(str)
148
+ str.match?(RE_NUMERIC)
149
+ end
150
+
151
+ # @param [String] str
152
+ def validate_string(str)
153
+ str.match?(RE_STRING)
154
+ end
155
+
156
+ # @param [String] str
157
+ # @param [Property] property
158
+ def validate_binary(str, property)
159
+ str.match?(RE_BINARY) || property.has_alias?(str)
160
+ end
161
+
162
+ # @param [String] str
163
+ # @param [Property] property
164
+ def validate_enumerative(str, property)
165
+ property.has_property_value?(str)
166
+ end
167
+
168
+ def validate_codepoints(array, threshold)
169
+ return (array.filter{validate_codepoint(_1) }.size.to_f / array.size) > threshold
170
+ end
171
+
172
+ def validate_numerics(array, threshold)
173
+ return (array.filter{validate_numeric(_1) }.size.to_f / array.size) > threshold
174
+ end
175
+
176
+ def validate_strings(array, threshold)
177
+ return (array.filter{validate_string(_1) }.size.to_f / array.size) > threshold
178
+ end
179
+
180
+ def validate_binaries(array, properties, threshold)
181
+ return (array.filter{validate_binary(_1, properties) }.size.to_f / array.size) > threshold
182
+ end
183
+
184
+ def validate_binaries_for_property(array, property, threshold)
185
+ return (array.filter{validate_binary_for_property(_1, property) }.size.to_f / array.size) > threshold
186
+ end
187
+ end
188
+ end
189
+
190
+ class FileRegexp
191
+ class << self
192
+ def matched_positions(text, regexp)
193
+ mp = []
194
+ m = text.match(regexp)
195
+
196
+ col_cnt = 0
197
+ while m
198
+ position = {}
199
+
200
+ position[:match_data] = m
201
+ col_cnt += m.pre_match.count("\n")
202
+ position[:begin_col] = col_cnt
203
+ col_cnt += m[0].count("\n")
204
+ position[:end_col] = col_cnt
205
+ position[:begin_point] = m.begin(0) - m.pre_match.rindex("\n").to_i() - 1
206
+ position[:end_point] = position[:begin_point] + m[0].size - m[0].rindex("\n").to_i() -1
207
+
208
+ mp << position
209
+ text = m.post_match
210
+ m = text.match(regexp)
211
+ end
212
+
213
+ mp
214
+ end
215
+ end
216
+ end
217
+
218
+ class FileManager
219
+ class << self
220
+ def filter_file(files, excluded_extensions, excluded_directories, excluded_files, included_files)
221
+ files = files.dup
222
+ original_files = files.dup
223
+ excluded_extensions = excluded_extensions.map { _1.downcase }
224
+ excluded_files = excluded_files.map { _1.downcase }
225
+ included_files = included_files.map { _1.downcase }
226
+
227
+ # remove files by excluded_extensions
228
+ files = files.reject { excluded_extensions.include? ext_no_dot(downcase_path(_1)).downcase }
229
+
230
+ # remove files by excluded_directories
231
+ excluded_directories.each do |dir|
232
+ files = files.reject { child?(dir, downcase_path(_1)) }
233
+ end
234
+
235
+ # remove files by excluded_files
236
+ files = files.reject { excluded_files.include? prefix(downcase_path(_1)) }
237
+
238
+ # remove test files
239
+ files = files.reject { prefix(_1).end_with? "Test" }
240
+
241
+ # add files by included_files
242
+ original_files.each do |ori_f|
243
+ included_files.each do |inc_f|
244
+ if (prefix(ori_f).downcase==inc_f || ori_f.basename.to_s.downcase==inc_f) && !files.include?(ori_f)
245
+ files << ori_f
246
+ end
247
+ end
248
+ end
249
+
250
+ files
251
+ end
252
+
253
+ def child?(parent, child)
254
+ if parent.class == String
255
+ child.descend.any? { _1.basename.to_s == parent }
256
+ elsif parent.class == Pathname
257
+ downcase_path(child).to_s.include? downcase_path(parent).to_s
258
+ end
259
+ end
260
+
261
+ # pathの-数字より前の文字列を取得
262
+ # @param [Pathname] path
263
+ # @return [String]
264
+ def prefix(path)
265
+ path = Pathname.new(path)
266
+ m = basename_no_ext(path).match(/^([\.\-0-9a-zA-Z_ ]+)-([\.0-9a-zA-Z_ ]+)$/)
267
+
268
+ if m
269
+ before_hyphen = m[1]
270
+ after_hyphen = m[2]
271
+
272
+ if after_hyphen.start_with?(/[0-9]/)
273
+ return before_hyphen
274
+ else
275
+ return m[0]
276
+ end
277
+ else
278
+ return basename_no_ext(path)
279
+ end
280
+ end
281
+
282
+ # pathのbasename部分をprefixのみに変更したPathnameを取得
283
+ # @param [Pathname] path
284
+ # @return [Pathname]
285
+ def prefix_path(path)
286
+ path.parent + Pathname.new(prefix(path) + path.extname)
287
+ end
288
+
289
+ def downcase_path(path)
290
+ Pathname.new(path.cleanpath.to_s.split("/").map { _1.downcase }.join("/")).cleanpath
291
+ end
292
+
293
+ def ext_no_dot(path)
294
+ path = Pathname.new(path)
295
+ ext = path.extname
296
+ if ext.empty?
297
+ return ""
298
+ else
299
+ return ext[1..]
300
+ end
301
+ end
302
+
303
+ def basename_no_ext(path)
304
+ name = path.basename.to_s
305
+ name.slice(0..(name.size-path.extname.size-1))
306
+ end
307
+
308
+ # pathsの中に含まれるzipファイルを全て展開
309
+ # @param [Iterable<Pathname>] paths
310
+ # @return [Boolean] 1つ以上のファイルが展開された場合true
311
+ def unzip(paths)
312
+ unzipped_f = false # 返り値用フラグ
313
+
314
+ paths.each do |path|
315
+ if ext_no_dot(path).downcase == "zip"
316
+
317
+ # dir/hoge.zipを展開した場合、dir/unzipped/hoge に保存
318
+ unzipped_cache_path = path.parent+Pathname.new("unzipped")+Pathname.new(prefix(path))
319
+
320
+ FileUtils.mkdir(unzipped_cache_path.parent) if !unzipped_cache_path.parent.exist?
321
+
322
+ # 既に展開済みファイルが存在する場合、展開処理は行わない
323
+ if unzipped_cache_path.exist?
324
+ break
325
+ else
326
+ FileUtils.mkdir(unzipped_cache_path)
327
+ end
328
+
329
+ Zip::File.open(path) do |zip_file|
330
+ zip_file.each do |entry|
331
+ zip_file.extract(entry, unzipped_cache_path+Pathname.new(entry.name))
332
+ end
333
+ end
334
+ unzipped_f = true
335
+ end
336
+ end
337
+
338
+ unzipped_f
339
+ end
340
+
341
+ # pathsの中に含まれるzipファイルを全て展開。zipファイルの中にzipファイルがある場合には再帰的に展開。
342
+ # @param [Iterable<Pathname>] paths
343
+ def recursive_unzip(paths)
344
+ loop do
345
+ return if !unzip(paths)
346
+ end
347
+ end
348
+
349
+ # pathがUnihanのファイルかを判定
350
+ # @note unihan_file_namesでUnihanのファイル名を指定できる。nilの場合、Unihan*のワイルドカードが使用される。
351
+ # @param [Pathname/String] file ファイルのパスまたはbasename_prefixに相当する文字列
352
+ # @param [Array<String>] unihan_file_names
353
+ def unihan_file?(file, unihan_file_names=nil)
354
+ if file.class==Pathname
355
+ file = prefix(file)
356
+ end
357
+ file = UniProp::Alias.canonical(file)
358
+
359
+ if unihan_file_names
360
+ unihan_file_names = unihan_file_names.map { UniProp::Alias.canonical(_1) }
361
+
362
+ return unihan_file_names.include?(file)
363
+ else
364
+ return file.match?(/unihan.*/)
365
+ end
366
+ end
367
+ end
368
+ end
369
+
370
+ class RangeProcessor
371
+ class << self
372
+ # rangesに含まれるRangeオブジェクトを結合した結果を含むArrayを取得
373
+ # @param [Array<Range>] ranges
374
+ # @return [Array<Range>]
375
+ def sum_up(ranges)
376
+ scattered_ranges = []
377
+ ranges.each do |range|
378
+ if range.class==Range
379
+ scattered_ranges << range.to_a
380
+ elsif range.class==Integer
381
+ scattered_ranges << range
382
+ end
383
+ end
384
+
385
+ array_to_ranges(scattered_ranges.flatten)
386
+ end
387
+
388
+ def sub(range_array1, range_array2)
389
+ range_array1 = sum_up(range_array1)
390
+ range_array2 = sum_up(range_array2)
391
+
392
+ array1 = (range_array1.map { _1.to_a }).flatten
393
+ array2 = (range_array2.map { _1.to_a }).flatten
394
+
395
+ non_dup_array = (Set.new(array1) - Set.new(array2)).to_a
396
+ array_to_ranges(non_dup_array)
397
+ end
398
+
399
+ # @param [Array<Integer>] array
400
+ # @return [Array<Range<Integer>>]
401
+ def array_to_ranges(array)
402
+ array = array.uniq.sort << Float::INFINITY
403
+
404
+ ranges = []
405
+
406
+ pre_elm = nil
407
+ begin_elm = nil
408
+
409
+ array.each do |elm|
410
+ if !pre_elm
411
+ pre_elm = elm
412
+ begin_elm = elm
413
+ elsif elm != pre_elm+1
414
+ ranges << Range.new(begin_elm, pre_elm)
415
+ begin_elm = elm
416
+ end
417
+ pre_elm = elm
418
+ end
419
+ ranges
420
+ end
421
+
422
+ def intersection(range_array)
423
+ if range_array.size==0
424
+ return nil
425
+ else
426
+ common_set = range_array[0].to_set
427
+ end
428
+
429
+ range_array.each do |range|
430
+ common_set &= range.to_set
431
+ end
432
+
433
+ array_to_ranges(common_set.to_a)[0]
434
+ end
435
+
436
+ def intersections_between_range_arrays(*range_arrays)
437
+ common_set_of_range_arrays = nil
438
+
439
+ range_arrays.each do |range_array|
440
+ set_of_range_array = Set.new
441
+ range_array.each { set_of_range_array.merge(_1.to_set) }
442
+
443
+ if common_set_of_range_arrays
444
+ common_set_of_range_arrays &= set_of_range_array
445
+ else
446
+ common_set_of_range_arrays = set_of_range_array
447
+ end
448
+ end
449
+
450
+ array_to_ranges(common_set_of_range_arrays.to_a)
451
+ end
452
+
453
+ # array内のRangeのいずれかに含まれるIntegerのうち、最小のものを取得
454
+ # @param [Array<Range<Integer>>] array
455
+ # @return [Integer?] arrayが空の場合nil
456
+ def min(array)
457
+ array.min_by { _1.min }&.min
458
+ end
459
+
460
+ # array内のRangeのいずれかに含まれるIntegerのうち、最大のものを取得
461
+ # @param [Array<Range<Integer>>] array
462
+ # @return [Integer?] arrayが空の場合nil
463
+ def max(array)
464
+ array.max_by { _1.max }&.max
465
+ end
466
+
467
+ # rangeを最小がlower_limit、最大がupper_limitの範囲内で切って返す(範囲の外部を切る)。切った結果、範囲が残らない場合、nilを返す
468
+ # @note 残す範囲にlower_limit, upper_limitも含まれる
469
+ # @param [Range<Integer>?] range
470
+ # @param [Integer] lower_limit
471
+ # @param [Integer] upper_limit
472
+ # @return [Range<Integer>?]
473
+ def cut_external(range, lower_limit, upper_limit)
474
+ return nil if range.max<lower_limit || upper_limit<range.min
475
+
476
+ result_min = (range.min<lower_limit) ? lower_limit : range.min
477
+ result_max = (upper_limit<range.max) ? upper_limit : range.max
478
+ result_min..result_max
479
+ end
480
+
481
+ # rangeを最小がlower_limit、最大がupper_limitの範囲内になるよう切って返す(範囲の内部を切る)。切った結果、範囲が残らない場合、nilを返す
482
+ # @note 残す範囲にlower_limit, upper_limitは含まれない
483
+ # @param [Array<Range<Integer>>] range
484
+ # @param [Integer] lower_limit
485
+ # @param [Integer] upper_limit
486
+ # @return [Range<Integer>?]
487
+ def cut_internal(range, lower_limit, upper_limit)
488
+ inner_range = cut_external((lower_limit..upper_limit), range.min, range.max)
489
+ return [range] if !inner_range
490
+
491
+ result = []
492
+ if range.min < inner_range.min
493
+ result << (range.min .. (inner_range.min-1))
494
+ end
495
+ if inner_range.max < range.max
496
+ result << ((inner_range.max+1) .. range.max)
497
+ end
498
+
499
+ result
500
+ end
501
+
502
+ # a..b形式のstrをRange<Integer>に変換
503
+ # @note strはRange<Integer>を表している必要がある
504
+ # @param [String] str
505
+ # @return [Range<Integer>]
506
+ # @raise [ConvertError] strがRange<Integer>を表していない場合発生
507
+ def str_to_range(str)
508
+ m = str.match(/^(\d+)\.\.(\d+)$/)
509
+ if m
510
+ return Range.new(m[1].to_i, m[2].to_i)
511
+ else
512
+ raise ConvertError, "Argument must be parsed as Range of Integer"
513
+ end
514
+ end
515
+ end
516
+ end
517
+
518
+ class CodepointConverter
519
+ class << self
520
+ # String型のcodepointをIntegerを使用したオブジェクトに変換
521
+ # @param [String] codepoint_str
522
+ # @return [Range<Integer,Integer>/Integer] 返る値はcodepoint_strの形式による
523
+ def str_to_int(codepoint_str)
524
+ if TypeJudgementer.validate_range_codepoint(codepoint_str)
525
+ m = codepoint_str.match(/^([0-9A-Fa-f]{4,6})\.\.([0-9A-Fa-f]{4,6})$/)
526
+
527
+ begin_codepoint = m[1]
528
+ end_codepoint = m[2]
529
+
530
+ return Range.new(begin_codepoint.hex, end_codepoint.hex)
531
+
532
+ elsif TypeJudgementer.validate_single_codepoint(codepoint_str)
533
+ return codepoint_str.hex
534
+ else
535
+ raise(ConvertError, "#{codepoint_str} is not a codepoint")
536
+ end
537
+ end
538
+ end
539
+ end
540
+
541
+ class ConvertError < StandardError; end
542
+ end