lucarecord 0.2.25 → 0.2.26

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1982b60b00eddc3d201368a2f436835fa51c6ba58538de3fcd0ce962c4529246
4
- data.tar.gz: 894ef1778f5a8be1f2091575ec86e84ceb61d0d158ba0cb4b8bc297c7070593a
3
+ metadata.gz: 526cf95ca548f5f2b4617f0af76337fa1d9a05896b212a4468f2d68db4b3ab03
4
+ data.tar.gz: 203299e63b9835ca1da312059df927cc8d65d81bea25d299fd08cf1d5afca0fa
5
5
  SHA512:
6
- metadata.gz: 396bfbd54619361753b576fdda9f715b115ab274df10dce07abb1a5f180ed161946c72bae02c4926ff05d828e20af6920c689a412c71234c5baef072f870b528
7
- data.tar.gz: 633d8920f547c893b2b1ac6645bdd108439d3568cfa3ec738c70ec61f1f658374e598bce0144a5a85142a15e4353d102e35b48ea84988f4dae06b0658a3952f7
6
+ metadata.gz: d53ff1db376c14b4b39f2975dc70ec0cb74a04dc25f7b17da193c532b819f76614399ad019e4101d73acde439bfdef7a0cc6773a8bacda88b0bc22dedeb8c5e8
7
+ data.tar.gz: 4feed97a4a7b5f7505f6aee51a71c7301606387f3ce2c1bf7086edf1c3bb5e3ceab42b43e7a5daa2576e94ba4ae8e054a3d5ec88d735b5ff47c903ee0a31290a
data/CHANGELOG.md CHANGED
@@ -1,3 +1,8 @@
1
+ ## LucaRecord 0.2.26
2
+
3
+ * Support #dig / #search for TSV dictionary
4
+ * Fix: shorten the n-gram split factor when the search word is shorter than the specified factor
5
+
1
6
  ## LucaRecord 0.2.25
2
7
 
3
8
  * Implement `dir_digest()` for data validation.
@@ -17,10 +17,11 @@ module LucaRecord
17
17
  set_driver
18
18
  end
19
19
 
20
- # Search word with n-gram.
20
+ # Search code with n-gram word.
21
21
  # If dictionary has Hash or Array, it returns [label, options].
22
22
  #
23
23
  def search(word, default_word = nil, main_key: 'label', options: nil)
24
+ definitions_lazyload
24
25
  res, score = max_score_code(word.gsub(/[[:space:]]/, ''))
25
26
  return default_word if score < 0.4
26
27
 
@@ -34,6 +35,12 @@ module LucaRecord
34
35
  end
35
36
  end
36
37
 
38
+ # Search with unique code.
39
+ #
40
+ def dig(*args)
41
+ @data.dig(*args)
42
+ end
43
+
37
44
  # Separate main item from other options.
38
45
  # If options specified as Array of string, it works as safe list filter.
39
46
  #
@@ -49,7 +56,6 @@ module LucaRecord
49
56
  [obj[main_key], options.compact]
50
57
  end
51
58
 
52
- #
53
59
  # Load CSV with config options
54
60
  #
55
61
  def load_csv(path)
@@ -58,7 +64,6 @@ module LucaRecord
58
64
  end
59
65
  end
60
66
 
61
- #
62
67
  # load dictionary data
63
68
  #
64
69
  def self.load(file = @filename)
@@ -72,7 +77,6 @@ module LucaRecord
72
77
  end
73
78
  end
74
79
 
75
- #
76
80
  # generate dictionary from TSV file. Minimum assumption is as below:
77
81
  # 1st row is converted symbol.
78
82
  #
@@ -101,7 +105,7 @@ module LucaRecord
101
105
  puts 'No error detected.'
102
106
  nil
103
107
  else
104
- "Key #{errors.join(', ')} has nil #{target_key}."
108
+ puts "Key #{errors.join(', ')} has nil #{target_key}."
105
109
  errors.count
106
110
  end
107
111
  end
@@ -109,9 +113,15 @@ module LucaRecord
109
113
  private
110
114
 
111
115
  def set_driver
112
- input = self.class.load(@path)
113
- @config = input['config']
114
- @definitions = input['definitions']
116
+ @data = self.class.load(@path)
117
+ @config = @data['config']
118
+ @definitions = @data['definitions']
119
+ end
120
+
121
+ # Build Reverse dictionary for TSV data
122
+ #
123
+ def definitions_lazyload
124
+ @definitions ||= @data.each_with_object({}) { |(k, entry), h| h[entry[:label]] = k if entry[:label] }
115
125
  end
116
126
 
117
127
  def self.dict_path(filename)
@@ -124,7 +134,7 @@ module LucaRecord
124
134
 
125
135
  def max_score_code(str)
126
136
  res = @definitions.map do |k, v|
127
- [v, match_score(str, k, 3)]
137
+ [v, match_score(str, k, 2)]
128
138
  end
129
139
  res.max { |x, y| x[1] <=> y[1] }
130
140
  end
@@ -311,6 +311,14 @@ module LucaRecord # :nodoc:
311
311
  File.open(subpath, mode) { |f| yield(f, id_set) }
312
312
  end
313
313
  end
314
+
315
+ # Calculate md5sum with original digest, file content and filename(optional).
316
+ #
317
+ def update_digest(digest, str, filename = nil)
318
+ str = filename.nil? ? str : filename + str
319
+ content = Digest::MD5.new.update(str).hexdigest
320
+ Digest::MD5.new.update(digest + content).hexdigest
321
+ end
314
322
  end
315
323
 
316
324
  # git object like structure
@@ -433,13 +441,5 @@ module LucaRecord # :nodoc:
433
441
  {}
434
442
  end
435
443
  end
436
-
437
- # Calculate md5sum with original digest, file content and filename(optional).
438
- #
439
- def update_digest(digest, str, filename = nil)
440
- str = filename.nil? ? str : filename + str
441
- content = Digest::MD5.new.update(str).hexdigest
442
- Digest::MD5.new.update(digest + content).hexdigest
443
- end
444
444
  end
445
445
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module LucaRecord
4
- VERSION = '0.2.25'
4
+ VERSION = '0.2.26'
5
5
  end
@@ -132,8 +132,9 @@ module LucaSupport
132
132
  end
133
133
 
134
134
  def match_score(a, b, n = 2)
135
- v_a = to_ngram(a, n)
136
- v_b = to_ngram(b, n)
135
+ split_factor = [a.length, b.length, n].min
136
+ v_a = to_ngram(a, split_factor)
137
+ v_b = to_ngram(b, split_factor)
137
138
 
138
139
  v_a.map { |item| v_b.include?(item) ? 1 : 0 }.sum / v_a.length.to_f
139
140
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: lucarecord
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.25
4
+ version: 0.2.26
5
5
  platform: ruby
6
6
  authors:
7
7
  - Chuma Takahiro
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2021-03-13 00:00:00.000000000 Z
11
+ date: 2021-03-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: mail