eco-helpers 2.0.15 → 2.0.21

Sign up to get free protection for your applications and to get access to all the features.
Files changed (97) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +109 -3
  3. data/eco-helpers.gemspec +11 -5
  4. data/lib/eco-helpers.rb +2 -0
  5. data/lib/eco/api/common/base_loader.rb +14 -0
  6. data/lib/eco/api/common/loaders/parser.rb +1 -0
  7. data/lib/eco/api/common/people/default_parsers/date_parser.rb +11 -1
  8. data/lib/eco/api/common/people/default_parsers/login_providers_parser.rb +1 -1
  9. data/lib/eco/api/common/people/default_parsers/policy_groups_parser.rb +11 -11
  10. data/lib/eco/api/common/people/entries.rb +1 -0
  11. data/lib/eco/api/common/people/entry_factory.rb +74 -23
  12. data/lib/eco/api/common/people/person_entry.rb +5 -2
  13. data/lib/eco/api/common/people/supervisor_helpers.rb +27 -0
  14. data/lib/eco/api/common/session.rb +1 -0
  15. data/lib/eco/api/common/session/base_session.rb +2 -0
  16. data/lib/eco/api/common/session/file_manager.rb +2 -2
  17. data/lib/eco/api/common/session/helpers.rb +30 -0
  18. data/lib/eco/api/common/session/helpers/prompt_user.rb +34 -0
  19. data/lib/eco/api/common/session/mailer.rb +0 -1
  20. data/lib/eco/api/common/session/s3_uploader.rb +0 -1
  21. data/lib/eco/api/common/session/sftp.rb +0 -1
  22. data/lib/eco/api/common/version_patches/ecoportal_api/external_person.rb +1 -1
  23. data/lib/eco/api/common/version_patches/ecoportal_api/internal_person.rb +7 -4
  24. data/lib/eco/api/common/version_patches/exception.rb +11 -4
  25. data/lib/eco/api/microcases.rb +3 -1
  26. data/lib/eco/api/microcases/append_usergroups.rb +0 -1
  27. data/lib/eco/api/microcases/people_cache.rb +2 -2
  28. data/lib/eco/api/microcases/people_load.rb +2 -2
  29. data/lib/eco/api/microcases/people_refresh.rb +2 -2
  30. data/lib/eco/api/microcases/people_search.rb +6 -6
  31. data/lib/eco/api/microcases/preserve_default_tag.rb +23 -0
  32. data/lib/eco/api/microcases/preserve_filter_tags.rb +28 -0
  33. data/lib/eco/api/microcases/preserve_policy_groups.rb +30 -0
  34. data/lib/eco/api/microcases/set_account.rb +0 -1
  35. data/lib/eco/api/microcases/with_each.rb +67 -6
  36. data/lib/eco/api/microcases/with_each_present.rb +4 -2
  37. data/lib/eco/api/microcases/with_each_starter.rb +4 -2
  38. data/lib/eco/api/organization.rb +1 -0
  39. data/lib/eco/api/organization/people.rb +98 -22
  40. data/lib/eco/api/organization/people_similarity.rb +272 -0
  41. data/lib/eco/api/organization/person_schemas.rb +5 -1
  42. data/lib/eco/api/organization/policy_groups.rb +5 -1
  43. data/lib/eco/api/organization/presets_factory.rb +40 -80
  44. data/lib/eco/api/organization/presets_integrity.json +6 -0
  45. data/lib/eco/api/organization/presets_values.json +5 -4
  46. data/lib/eco/api/organization/tag_tree.rb +33 -0
  47. data/lib/eco/api/policies/default_policies/99_user_access_policy.rb +0 -30
  48. data/lib/eco/api/session.rb +10 -24
  49. data/lib/eco/api/session/batch.rb +25 -7
  50. data/lib/eco/api/session/config.rb +16 -15
  51. data/lib/eco/api/session/config/api.rb +4 -0
  52. data/lib/eco/api/session/config/apis.rb +80 -0
  53. data/lib/eco/api/session/config/files.rb +7 -0
  54. data/lib/eco/api/session/config/people.rb +3 -19
  55. data/lib/eco/api/usecases/default_cases.rb +4 -1
  56. data/lib/eco/api/usecases/default_cases/abstract_policygroup_abilities_case.rb +161 -0
  57. data/lib/eco/api/usecases/default_cases/analyse_people_case.rb +223 -0
  58. data/lib/eco/api/usecases/default_cases/clean_unknown_tags_case.rb +37 -0
  59. data/lib/eco/api/usecases/default_cases/codes_to_tags_case.rb +2 -3
  60. data/lib/eco/api/usecases/default_cases/reset_landing_page_case.rb +11 -1
  61. data/lib/eco/api/usecases/default_cases/restore_db_case.rb +1 -2
  62. data/lib/eco/api/usecases/default_cases/supers_cyclic_identify_case.rb +72 -0
  63. data/lib/eco/api/usecases/default_cases/supers_hierarchy_case.rb +1 -1
  64. data/lib/eco/api/usecases/default_cases/to_csv_case.rb +132 -29
  65. data/lib/eco/api/usecases/default_cases/to_csv_detailed_case.rb +61 -36
  66. data/lib/eco/api/usecases/ooze_samples/ooze_update_case.rb +3 -2
  67. data/lib/eco/cli.rb +0 -10
  68. data/lib/eco/cli/config/default/options.rb +48 -17
  69. data/lib/eco/cli/config/default/people.rb +18 -24
  70. data/lib/eco/cli/config/default/people_filters.rb +3 -3
  71. data/lib/eco/cli/config/default/usecases.rb +105 -28
  72. data/lib/eco/cli/config/default/workflow.rb +21 -12
  73. data/lib/eco/cli/config/help.rb +1 -0
  74. data/lib/eco/cli/config/options_set.rb +106 -13
  75. data/lib/eco/cli/config/use_cases.rb +33 -33
  76. data/lib/eco/cli/scripting/args_helpers.rb +30 -3
  77. data/lib/eco/csv.rb +4 -2
  78. data/lib/eco/csv/table.rb +121 -21
  79. data/lib/eco/data.rb +1 -0
  80. data/lib/eco/data/crypto/encryption.rb +3 -3
  81. data/lib/eco/data/files/directory.rb +28 -20
  82. data/lib/eco/data/files/helpers.rb +6 -4
  83. data/lib/eco/data/fuzzy_match.rb +201 -0
  84. data/lib/eco/data/fuzzy_match/array_helpers.rb +75 -0
  85. data/lib/eco/data/fuzzy_match/chars_position_score.rb +38 -0
  86. data/lib/eco/data/fuzzy_match/ngrams_score.rb +82 -0
  87. data/lib/eco/data/fuzzy_match/pairing.rb +95 -0
  88. data/lib/eco/data/fuzzy_match/result.rb +87 -0
  89. data/lib/eco/data/fuzzy_match/results.rb +77 -0
  90. data/lib/eco/data/fuzzy_match/score.rb +49 -0
  91. data/lib/eco/data/fuzzy_match/stop_words.rb +35 -0
  92. data/lib/eco/data/fuzzy_match/string_helpers.rb +82 -0
  93. data/lib/eco/version.rb +1 -1
  94. metadata +168 -11
  95. data/lib/eco/api/microcases/refresh_abilities.rb +0 -19
  96. data/lib/eco/api/organization/presets_reference.json +0 -59
  97. data/lib/eco/api/usecases/default_cases/refresh_abilities_case.rb +0 -30
data/lib/eco/data.rb CHANGED
@@ -6,3 +6,4 @@ end
6
6
  require_relative 'data/crypto'
7
7
  require_relative 'data/files'
8
8
  require_relative 'data/mapper'
9
+ require_relative 'data/fuzzy_match'
@@ -1,7 +1,7 @@
1
1
  require 'openssl'
2
- require 'json'
2
+ # 'json'
3
3
  require 'base64'
4
- require 'pp'
4
+ #require 'pp'
5
5
 
6
6
  require_relative '../../cli/scripting'
7
7
 
@@ -147,7 +147,7 @@ module Eco
147
147
  return str_c
148
148
  #EncryptedData.new({content: str_c, key: key, iv: iv})
149
149
 
150
-
150
+
151
151
  end
152
152
  def aes256_decrypt(data, key: , iv: , block_octets: BLOCK_OCTETS)
153
153
  block_bits = block_bits * 8
@@ -5,6 +5,29 @@ module Eco
5
5
  module Files
6
6
  class Directory
7
7
 
8
class << self
  # Makes sure the directory part of `path` exists, creating every missing
  # level one at a time.
  # @param path [String] the target path; it is expanded with `File.expand_path`.
  # @param includes_file [Boolean] when `true` the last component of `path`
  #   is taken to be a file name and is not created as a directory.
  # @return [Boolean] `true` when the path already exists or was created,
  #   `false` when creating any level raised an error.
  def create(path, includes_file: false)
    return true if Files.file_exists?(path)

    parts = Files.split(File.expand_path(path))
    parts.pop if includes_file

    return true if Files.dir_exists?(File.join(*parts))

    subpath = nil
    begin
      parts.each do |curr|
        subpath = subpath ? File.join(subpath, curr) : curr
        Dir.mkdir(subpath) unless Files.dir_exists?(subpath)
      end
    rescue StandardError => e
      # Only regular errors are reported and swallowed; interrupts and
      # `SystemExit` must keep propagating to the caller.
      pp e
      return false
    end
    true
  end
end
30
+
8
31
  attr_reader :dir_path
9
32
 
10
33
  def initialize(dir_path = Dir.pwd)
@@ -14,12 +37,14 @@ module Eco
14
37
  end
15
38
 
16
39
  def exists?
17
- Files.dir_exists(@dir_path)
40
+ Files.dir_exists?(@dir_path)
18
41
  end
19
42
 
20
43
  def create
21
- succeed = Directory.create(File.expand_path(@dir_path)) unless self.exists?
22
- self.full_path if succeed
44
+ return self.full_path if self.exists?
45
+ if succeed = Directory.create(File.expand_path(@dir_path))
46
+ return self.full_path
47
+ end
23
48
  end
24
49
 
25
50
  def full_path
@@ -57,23 +82,6 @@ module Eco
57
82
  File.join(*args)
58
83
  end
59
84
 
60
- def self.create(path, includes_file: false)
61
- return true if Files.file_exists?(path)
62
- parts = Files.split(File.expand_path(path))
63
- filename = parts.pop if includes_file
64
- return true if Files.dir_exists?(File.join(*parts))
65
- subpath = nil
66
- begin
67
- parts.each do |curr|
68
- subpath = subpath ? File.join(subpath, curr) : curr
69
- Dir.mkdir(subpath) unless Files.dir_exists?(subpath)
70
- end
71
- rescue Exception => e
72
- pp e
73
- end
74
- false
75
- end
76
-
77
85
  private
78
86
 
79
87
  def file_pattern(value)
@@ -3,11 +3,13 @@ module Eco
3
3
  module Files
4
4
  DEFAULT_TIMESTAMP_PATTERN = '%Y-%m-%dT%H%M%S'
5
5
 
6
- def self.included(base)
7
- base.send(:include, InstanceMethods)
8
- base.extend(ClassMethods)
6
+ class << self
7
+ def included(base)
8
+ base.send(:include, InstanceMethods)
9
+ base.extend(ClassMethods)
10
+ end
9
11
  end
10
-
12
+
11
13
  module InstanceMethods
12
14
 
13
15
  end
@@ -0,0 +1,201 @@
1
+ require 'fuzzy_match'
2
+ require 'amatch'
3
+ require 'jaro_winkler'
4
+
5
+ require_relative 'fuzzy_match/stop_words'
6
+ require_relative 'fuzzy_match/array_helpers'
7
+ require_relative 'fuzzy_match/string_helpers'
8
+ require_relative 'fuzzy_match/pairing'
9
+ require_relative 'fuzzy_match/chars_position_score'
10
+ require_relative 'fuzzy_match/ngrams_score'
11
+
12
module Eco
  module Data
    # Mixin that adds fuzzy string matching to the including class.
    # Including it adds `InstanceMethods` to instances and extends the class
    # with `ClassMethods`.
    module FuzzyMatch

      class << self
        # Hook run by Ruby when this module is included in `base`.
        # @param base [Module] the class or module including `FuzzyMatch`.
        def included(base)
          base.send(:include, InstanceMethods)
          base.extend(ClassMethods)
        end
      end

      module ClassMethods
        include ArrayHelpers
        include StringHelpers
        include Pairing
        include CharsPositionScore
        include NGramsScore

        # Jaro-Winkler comparison of two strings via the `JaroWinkler` library.
        # @param str1 [String, nil]
        # @param str2 [String, nil]
        # @param options [Hash] forwarded to `JaroWinkler.distance`; defaults
        #   to `ignore_case: true, weight: 0.25`.
        # @return [Numeric] `0` when either string is missing, otherwise the
        #   value returned by `JaroWinkler.distance`.
        def jaro_winkler(str1, str2, **options)
          return 0 if !str1 || !str2

          defaults = { ignore_case: true, weight: 0.25 }
          JaroWinkler.distance(str1, str2, **defaults.merge(options))
        end
      end

      module InstanceMethods
        FUZZY_MATCH_OPTIONS = [
          :identities, :groupings, :stop_words, :read,
          :must_match_grouping, :must_match_at_least_one_word,
          :gather_last_result, :threshold
        ].freeze

        JARO_OPTIONS     = [:ignore_case, :weight].freeze
        NGRAMS_OPTIONS   = [:range].freeze
        POSITION_OPTIONS = [:max_distance].freeze
        RESULTS_OPTIONS  = [:order, :threshold].freeze

        include StopWords

        # Only the writer is generated: the reader below supplies the lazy default.
        attr_writer :fuzzy_options

        # @return [Hash] the options used by the last `fuzzy_match` call (`{}` by default).
        def fuzzy_options
          @fuzzy_options ||= {}
        end

        # Builds the underlying `::FuzzyMatch` matcher, or reuses the memoized one.
        # The memoized matcher is reused only when no `haystack_data` is given
        # and the effective matcher options are unchanged.
        # @param haystack_data [Enumerable, nil] the items to search among.
        # @param options [Hash] stored as `fuzzy_options`.
        # @return [::FuzzyMatch]
        def fuzzy_match(haystack_data = nil, **options)
          if instance_variable_defined?(:@fuzzy_match) && !haystack_data
            return @fuzzy_match if fuzzy_match_options == fuzzy_match_options(options)
          end
          @fuzzy_options = options
          # make it run with a native C extension (for better performance: ~130 % increase of performance)
          ::FuzzyMatch.engine = :amatch
          @fuzzy_match = ::FuzzyMatch.new(haystack(haystack_data), fuzzy_match_options)
        end

        # TODO: integration for options[:unique_words] => to ensure repeated words do not bring down the score are cut by threshold
        # @note
        #   - When the `haystack` elements are **non** `String` objects, it excludes the needle itself from the results
        # @param needle [String, Object] object is allowed when `fuzzy_options` includes `read:` key.
        # @param needle_str [String, nil] the actual value of needle_str to be used.
        # @param haystack [Enumerable] the items to find `needle` among.
        # @return [Eco::Data::FuzzyMatch::Results]
        def find_all_with_score(needle, needle_str: nil, haystack: nil, **options)
          base_match    = fuzzy_match(haystack, **options)
          match_results = base_match.find_all_with_score(needle_str || needle)
          needle_str  ||= item_string(needle)

          scored = match_results.each_with_object([]) do |(item, dice, lev), acc|
            next if item == needle

            item_str = item_string(item)

            # A blank string on either side cannot match: force every score to 0.
            if item_str.to_s.strip.empty? || needle_str.to_s.strip.empty?
              dice = lev = jaro_res = ngram_res = wngram_res = pos_res = 0
            end

            jaro_res   ||= jaro(needle_str, item_str)
            ngram_res  ||= ngram(needle_str, item_str)
            wngram_res ||= words_ngram(needle_str, item_str)
            pos_res    ||= position(needle_str, item_str)

            acc << Result.new(item, item_str, needle_str, dice, lev, jaro_res, ngram_res, wngram_res, pos_res)
          end

          Results.new(needle, needle_str, scored).tap do |res|
            res.order     = fuzzy_options[:order] if fuzzy_options[:order]
            res.threshold = fuzzy_options[:threshold] if fuzzy_options[:threshold]
          end.relevant_results
        end

        # Re-scores existing `results` using the strings returned by the block.
        # @param results [Eco::Data::FuzzyMatch::Results] the results to re-score.
        # @param needle_str [String, nil] overrides `results.value` as the needle string.
        # @param options [Hash] `:order` and `:threshold` are applied to the new results.
        # @yield [needle_str, item_str, needle, item] called once per result.
        # @yieldreturn [Array<(String, String)>] the needle and item strings to compare.
        # @raise [RuntimeError] when no block is given.
        # @return [Eco::Data::FuzzyMatch::Results]
        def recalculate_results(results, needle_str: nil, **options)
          raise "You should provide a block |needle_str, item_str, needle, item|" unless block_given?

          rescored = results.each_with_object([]) do |result, acc|
            nstr, istr = yield(needle_str || results.value, result.value, results.needle, result.match)

            if istr.to_s.strip.empty?
              # NOTE(review): a blank item string is given the top score on every
              # measure; presumably deliberate, confirm.
              dice = lev = jaro_res = ngram_res = wngram_res = pos_res = 1
            elsif nstr.to_s.strip.empty? && istr != needle_str
              # Compared with `!=`: the original assigned (`istr = needle_str`),
              # which overwrote the string yielded by the block.
              dice = lev = jaro_res = ngram_res = wngram_res = pos_res = 0
            end

            res = ::FuzzyMatch.score_class.new(nstr, istr) unless dice && lev
            dice       ||= res&.dices_coefficient_similar || 0
            lev        ||= res&.levenshtein_similar || 0
            jaro_res   ||= jaro(nstr, istr)
            ngram_res  ||= ngram(nstr, istr)
            wngram_res ||= words_ngram(nstr, istr)
            pos_res    ||= position(nstr, istr)

            acc << Result.new(*result.values_at(:match, :value, :needle_str), dice, lev, jaro_res, ngram_res, wngram_res, pos_res)
          end

          Results.new(results.needle, results.value, rescored).tap do |res|
            res.order     = options[:order] if options[:order]
            res.threshold = options[:threshold] if options[:threshold]
          end.relevant_results
        end

        private

        # Jaro-Winkler result using the `JARO_OPTIONS` subset of `fuzzy_options`.
        def jaro(str1, str2)
          options = fuzzy_options.slice(*JARO_OPTIONS)
          self.class.jaro_winkler(str1, str2, **options)
        end

        # Ratio of the n-grams score (default range `3..5`).
        def ngram(str1, str2)
          options = { range: 3..5 }.merge(fuzzy_options.slice(*NGRAMS_OPTIONS))
          self.class.ngrams_score(str1, str2, **options).ratio
        end

        # Ratio of the word-by-word n-grams score (default range `3..7`).
        def words_ngram(str1, str2)
          options = { range: 3..7 }.merge(fuzzy_options.slice(*NGRAMS_OPTIONS))
          self.class.words_ngrams_score(str1, str2, **options).ratio
        end

        # Ratio of the chars position score, using `POSITION_OPTIONS`.
        def position(str1, str2)
          options = fuzzy_options.slice(*POSITION_OPTIONS)
          self.class.chars_position_score(str1, str2, **options).ratio
        end

        # @note
        #   - When used in an `Enumerable` it will use `to_a`, or `values` if it's a `Hash`
        # @param data [Enumerable, nil] defaults to `self` when `self` is an `Enumerable`.
        # @raise [RuntimeError] when there is no `Enumerable` to read from, or
        #   when non `String` items are present and no `read:` option was given.
        # @return [Array<Object>] the non-repeated values of `data`
        def haystack(data = nil)
          data = self if is_a?(Enumerable) && !data
          raise "'data' should be an Enumerable. Given: #{data.class}" unless data.is_a?(Enumerable)

          # Flatten the resolved `data` (the original always read from `self`,
          # ignoring an explicitly given haystack).
          data = data.is_a?(Hash) ? data.values.flatten : data.to_a.flatten
          data.uniq.compact.tap do |items|
            if !fuzzy_read_method && found = items.find {|item| !item.is_a?(String)}
              raise "To use non String objects as 'haystack' you should provide `read:` or `options[:read]`. Given element: #{found.class}"
            end
          end
        end

        # Resolves the string used to compare `item`.
        # @param item [String, Object, nil]
        # @param attr [Proc, Symbol, String, nil] how to read the string from `item`.
        # @return [String, Object, nil] `item` itself when it is a `String`, is
        #   missing or there is no `attr`; otherwise the value read through
        #   `attr`, or `nil` when `item` does not respond to it.
        def item_string(item, attr = fuzzy_read_method)
          return item if !item || item.is_a?(String) || !attr
          return attr.call(item) if attr.is_a?(Proc)

          attr = attr.to_sym
          return item.send(attr) if item.respond_to?(attr)
        end

        # Options handed to `::FuzzyMatch.new`.
        # NOTE(review): the built-in stop words are merged last, so they replace
        # any `:stop_words` supplied by the caller; confirm that is intended.
        # @param options [Hash, nil] defaults to `fuzzy_options`.
        # @return [Hash]
        def fuzzy_match_options(options = nil)
          options ||= fuzzy_options
          options.slice(*FUZZY_MATCH_OPTIONS).merge({
            stop_words: PREPOSITIONS + PRONOUNS + ARTICLES
          })
        end

        # @return [Proc, Symbol, String, nil] the `:read` option, if any.
        def fuzzy_read_method
          fuzzy_match_options[:read]
        end

      end

      # Expose the class-level helpers directly on `Eco::Data::FuzzyMatch`.
      class << self
        include FuzzyMatch::ClassMethods
      end

    end
  end
end
198
+
199
+ require_relative 'fuzzy_match/score'
200
+ require_relative 'fuzzy_match/result'
201
+ require_relative 'fuzzy_match/results'
@@ -0,0 +1,75 @@
1
module Eco
  module Data
    module FuzzyMatch
      # Helpers to build n-grams, combinations and permutations out of arrays.
      module ArrayHelpers
        # Keeps the start order of the `values` and consecutive `values` together/consecutive.
        # @param values [Array] the input array with the values.
        # @param range [Integer, Range] determines the length (number of `values`) of each n-gram.
        #   Sizes lower than `1` generate no n-grams.
        # @return [Array<String>] the unique n-grams, each one being its `values` joined by a single space.
        def ngrams(values, range = 2..3)
          return range.flat_map { |size| ngrams(values, size) }.uniq unless range.is_a?(Integer)
          return [] if range < 1

          values.each_cons(range).map { |window| window.join(' ') }.uniq
        end

        # Keeps the start order of the `values` of the input `Array` `values`.
        # It does **not** keep consecutive `values` together (it can jump/skip items).
        # @param values [Array] the input array with the values.
        # @param range [Integer, Range] determines the length of the generated values.
        # @return [Array<Array<Value>>] combinations of `range` length of `values`
        def combinations(values, range = 2..3)
          return values.combination(range).to_a if range.is_a?(Integer)

          range.flat_map { |size| values.combination(size).to_a }
        end

        # It includes `combinations` that break the initial order of the `Array`.
        # It does **not** keep consecutive `values` together (it can jump/skip items).
        # @param values [Array] the input array with the values.
        # @param range [Integer, Range] determines the length of the generated values.
        # @return [Array<Array<Value>>] permutations of `range` length of `values`:
        #   first the plain combinations, then their remaining re-orderings.
        def permutations(values, range = 2..3)
          combos = combinations(values, range)
          # Every combination already has a length within `range`, so no
          # filtering is needed before permuting.
          (combos + combos.flat_map { |comb| comb.permutation.to_a }).uniq
        end

        # Helper to prepare a facet structure
        # @param values1 [Array] the input array with the values to have their facet against.
        # @param values2 [Array] the input array with the values to facet against.
        # @return [Hash] where `keys` are `values1` and `value` of each `key` is a copy of `values2`.
        #   It is empty when `values1` is not an `Enumerable`.
        def facet(values1, values2)
          return {} unless values1.is_a?(Enumerable)

          keys = values1.is_a?(Hash) ? values1.values : values1.to_a
          keys.each_with_object({}) { |val, out| out[val] = values2.dup }
        end

      end
    end
  end
end
@@ -0,0 +1,38 @@
1
module Eco
  module Data
    module FuzzyMatch
      module CharsPositionScore
        # For each character in `str1`, a search is performed on `str2`.
        # The search is deemed successful if the character is found in `str2` before
        # the expected position plus `max_distance`.
        # A score is kept of matching characters.
        # @note This algorithm is best suited for matching mis-spellings.
        # @note The lookup always takes the **first** occurrence of the character in `str2`.
        # @param str1 [String, nil] the string whose characters are searched for.
        # @param str2 [String, nil] the string searched in.
        # @param max_distance [Integer] maximum char position distance to score.
        # @param normalized [Boolean] to avoid double ups in normalizing.
        # @return [Score] the score object with the result. Its total is the length
        #   of `str1`, or `0` when either string is missing or empty.
        def chars_position_score(str1, str2, max_distance: 3, normalized: false)
          str1, str2 = normalize_string([str1, str2]) unless normalized
          len1 = str1 && str1.length

          Score.new(0, 0).tap do |score|
            next if !str2 || !str1 || str2.empty? || str1.empty?

            score.total = len1
            next score.increase(score.total) if str1 == str2
            next if len1 < 2

            cursor = 0
            str1.each_char do |char|
              expected = cursor + 1
              index    = str2.index(char)
              if index && index < (expected + max_distance)
                score.increase
                # On a hit the cursor jumps to where the char was found...
                cursor = index
              else
                # ...otherwise it just advances one position.
                cursor = expected
              end
            end
          end
        end

      end
    end
  end
end
@@ -0,0 +1,82 @@
1
module Eco
  module Data
    module FuzzyMatch
      module NGramsScore
        # It does the following:
        #   1. It splits both strings into words
        #   2. Pairs all words by best `ngrams_score` match
        #   3. Gives `0` score to those words of `str2` that lost their pair (a word of `str1` cannot be paired twice)
        #   4. Merges the `ngrams_score` of all the paired words of `str2` against their `str1` word pair
        # @param str1 [String, nil]
        # @param str2 [String, nil]
        # @param range [Integer, Range] determines the length of the generated values for each `word`.
        # @param normalized [Boolean] to avoid double ups in normalizing.
        # @return [Score] the score object with the result.
        def words_ngrams_score(str1, str2, range: 3..5, normalized: false)
          str1, str2 = normalize_string([str1, str2]) unless normalized
          len1 = str1 && str1.length

          Score.new(0, 0).tap do |score|
            next if !str2 || !str1
            next score.increase_total(len1) if str2.empty? || str1.empty?

            if str1 == str2
              score.total = len1
              score.increase(score.total)
            end

            # Either string being shorter than 2 chars adds `len1` to the total
            # (the original tested `str1` twice and never looked at `str2`).
            score.increase_total(len1) if str1.length < 2 || str2.length < 2

            paired = paired_words(str1, str2, normalized: true) do |needle, item|
              ngrams_score(needle, item, range: range, normalized: true)
            end

            paired.each do |_word, data|
              _item, item_score = data
              score.merge!(item_score)
            end
          end
        end

        # A score is kept of matching ngram combinations of `str2`.
        # @note This algorithm is best suited for matching sentences, or 'firstname lastname' compared with 'lastname firstname' combinations.
        # @param str1 [String, nil] the string the n-grams of `str2` are looked up in.
        # @param str2 [String, nil] the string the n-grams are generated from.
        # @param range [Integer, Range] determines the length of the generated values.
        # @param normalized [Boolean] to avoid double ups in normalizing.
        # @return [Score] the score object with the result.
        def ngrams_score(str1, str2, range: 3..5, normalized: false)
          str1, str2 = normalize_string([str1, str2]) unless normalized
          len1 = str1 && str1.length

          Score.new(0, len1 || 0).tap do |score|
            next if !str2 || !str1
            next if str2.empty? || str1.empty?

            score.total = len1
            next score.increase(score.total) if str1 == str2
            next if str1.length < 2 || str2.length < 2

            grams       = word_ngrams(str2, range, normalized: true)
            grams_count = grams.length
            next unless grams_count > 0

            if range.is_a?(Integer)
              item_weight = score.total.to_f / grams_count
              # The original block referenced an undefined `gram` here (NameError).
              matches = grams.count { |gram| str1.include?(gram) }
              score.increase(matches * item_weight)
            else
              # One group per n-gram length; each length gets an equal share of the total.
              groups       = grams.group_by(&:length)
              group_weight = (1.0 / groups.length).round(3)

              groups.each_value do |len_grams|
                len_max_score = score.total * group_weight
                # NOTE(review): the share is divided by the count of ALL grams,
                # not by this group's own size; confirm that is intended.
                item_weight = len_max_score / grams_count
                matches     = len_grams.count { |gram| str1.include?(gram) }
                score.increase(matches * item_weight)
              end
            end
          end
        end

      end
    end
  end
end