eco-helpers 2.0.15 → 2.0.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +109 -3
  3. data/eco-helpers.gemspec +11 -5
  4. data/lib/eco-helpers.rb +2 -0
  5. data/lib/eco/api/common/base_loader.rb +14 -0
  6. data/lib/eco/api/common/loaders/parser.rb +1 -0
  7. data/lib/eco/api/common/people/default_parsers/date_parser.rb +11 -1
  8. data/lib/eco/api/common/people/default_parsers/login_providers_parser.rb +1 -1
  9. data/lib/eco/api/common/people/default_parsers/policy_groups_parser.rb +11 -11
  10. data/lib/eco/api/common/people/entries.rb +1 -0
  11. data/lib/eco/api/common/people/entry_factory.rb +74 -23
  12. data/lib/eco/api/common/people/person_entry.rb +5 -2
  13. data/lib/eco/api/common/people/supervisor_helpers.rb +27 -0
  14. data/lib/eco/api/common/session.rb +1 -0
  15. data/lib/eco/api/common/session/base_session.rb +2 -0
  16. data/lib/eco/api/common/session/file_manager.rb +2 -2
  17. data/lib/eco/api/common/session/helpers.rb +30 -0
  18. data/lib/eco/api/common/session/helpers/prompt_user.rb +34 -0
  19. data/lib/eco/api/common/session/mailer.rb +0 -1
  20. data/lib/eco/api/common/session/s3_uploader.rb +0 -1
  21. data/lib/eco/api/common/session/sftp.rb +0 -1
  22. data/lib/eco/api/common/version_patches/ecoportal_api/external_person.rb +1 -1
  23. data/lib/eco/api/common/version_patches/ecoportal_api/internal_person.rb +7 -4
  24. data/lib/eco/api/common/version_patches/exception.rb +11 -4
  25. data/lib/eco/api/microcases.rb +3 -1
  26. data/lib/eco/api/microcases/append_usergroups.rb +0 -1
  27. data/lib/eco/api/microcases/people_cache.rb +2 -2
  28. data/lib/eco/api/microcases/people_load.rb +2 -2
  29. data/lib/eco/api/microcases/people_refresh.rb +2 -2
  30. data/lib/eco/api/microcases/people_search.rb +6 -6
  31. data/lib/eco/api/microcases/preserve_default_tag.rb +23 -0
  32. data/lib/eco/api/microcases/preserve_filter_tags.rb +28 -0
  33. data/lib/eco/api/microcases/preserve_policy_groups.rb +30 -0
  34. data/lib/eco/api/microcases/set_account.rb +0 -1
  35. data/lib/eco/api/microcases/with_each.rb +67 -6
  36. data/lib/eco/api/microcases/with_each_present.rb +4 -2
  37. data/lib/eco/api/microcases/with_each_starter.rb +4 -2
  38. data/lib/eco/api/organization.rb +1 -0
  39. data/lib/eco/api/organization/people.rb +98 -22
  40. data/lib/eco/api/organization/people_similarity.rb +272 -0
  41. data/lib/eco/api/organization/person_schemas.rb +5 -1
  42. data/lib/eco/api/organization/policy_groups.rb +5 -1
  43. data/lib/eco/api/organization/presets_factory.rb +40 -80
  44. data/lib/eco/api/organization/presets_integrity.json +6 -0
  45. data/lib/eco/api/organization/presets_values.json +5 -4
  46. data/lib/eco/api/organization/tag_tree.rb +33 -0
  47. data/lib/eco/api/policies/default_policies/99_user_access_policy.rb +0 -30
  48. data/lib/eco/api/session.rb +10 -24
  49. data/lib/eco/api/session/batch.rb +25 -7
  50. data/lib/eco/api/session/config.rb +16 -15
  51. data/lib/eco/api/session/config/api.rb +4 -0
  52. data/lib/eco/api/session/config/apis.rb +80 -0
  53. data/lib/eco/api/session/config/files.rb +7 -0
  54. data/lib/eco/api/session/config/people.rb +3 -19
  55. data/lib/eco/api/usecases/default_cases.rb +4 -1
  56. data/lib/eco/api/usecases/default_cases/abstract_policygroup_abilities_case.rb +161 -0
  57. data/lib/eco/api/usecases/default_cases/analyse_people_case.rb +223 -0
  58. data/lib/eco/api/usecases/default_cases/clean_unknown_tags_case.rb +37 -0
  59. data/lib/eco/api/usecases/default_cases/codes_to_tags_case.rb +2 -3
  60. data/lib/eco/api/usecases/default_cases/reset_landing_page_case.rb +11 -1
  61. data/lib/eco/api/usecases/default_cases/restore_db_case.rb +1 -2
  62. data/lib/eco/api/usecases/default_cases/supers_cyclic_identify_case.rb +72 -0
  63. data/lib/eco/api/usecases/default_cases/supers_hierarchy_case.rb +1 -1
  64. data/lib/eco/api/usecases/default_cases/to_csv_case.rb +132 -29
  65. data/lib/eco/api/usecases/default_cases/to_csv_detailed_case.rb +61 -36
  66. data/lib/eco/api/usecases/ooze_samples/ooze_update_case.rb +3 -2
  67. data/lib/eco/cli.rb +0 -10
  68. data/lib/eco/cli/config/default/options.rb +48 -17
  69. data/lib/eco/cli/config/default/people.rb +18 -24
  70. data/lib/eco/cli/config/default/people_filters.rb +3 -3
  71. data/lib/eco/cli/config/default/usecases.rb +105 -28
  72. data/lib/eco/cli/config/default/workflow.rb +21 -12
  73. data/lib/eco/cli/config/help.rb +1 -0
  74. data/lib/eco/cli/config/options_set.rb +106 -13
  75. data/lib/eco/cli/config/use_cases.rb +33 -33
  76. data/lib/eco/cli/scripting/args_helpers.rb +30 -3
  77. data/lib/eco/csv.rb +4 -2
  78. data/lib/eco/csv/table.rb +121 -21
  79. data/lib/eco/data.rb +1 -0
  80. data/lib/eco/data/crypto/encryption.rb +3 -3
  81. data/lib/eco/data/files/directory.rb +28 -20
  82. data/lib/eco/data/files/helpers.rb +6 -4
  83. data/lib/eco/data/fuzzy_match.rb +201 -0
  84. data/lib/eco/data/fuzzy_match/array_helpers.rb +75 -0
  85. data/lib/eco/data/fuzzy_match/chars_position_score.rb +38 -0
  86. data/lib/eco/data/fuzzy_match/ngrams_score.rb +82 -0
  87. data/lib/eco/data/fuzzy_match/pairing.rb +95 -0
  88. data/lib/eco/data/fuzzy_match/result.rb +87 -0
  89. data/lib/eco/data/fuzzy_match/results.rb +77 -0
  90. data/lib/eco/data/fuzzy_match/score.rb +49 -0
  91. data/lib/eco/data/fuzzy_match/stop_words.rb +35 -0
  92. data/lib/eco/data/fuzzy_match/string_helpers.rb +82 -0
  93. data/lib/eco/version.rb +1 -1
  94. metadata +168 -11
  95. data/lib/eco/api/microcases/refresh_abilities.rb +0 -19
  96. data/lib/eco/api/organization/presets_reference.json +0 -59
  97. data/lib/eco/api/usecases/default_cases/refresh_abilities_case.rb +0 -30
data/lib/eco/data.rb CHANGED
@@ -6,3 +6,4 @@ end
6
6
  require_relative 'data/crypto'
7
7
  require_relative 'data/files'
8
8
  require_relative 'data/mapper'
9
+ require_relative 'data/fuzzy_match'
@@ -1,7 +1,7 @@
1
1
  require 'openssl'
2
- require 'json'
2
+ # 'json'
3
3
  require 'base64'
4
- require 'pp'
4
+ #require 'pp'
5
5
 
6
6
  require_relative '../../cli/scripting'
7
7
 
@@ -147,7 +147,7 @@ module Eco
147
147
  return str_c
148
148
  #EncryptedData.new({content: str_c, key: key, iv: iv})
149
149
 
150
-
150
+
151
151
  end
152
152
  def aes256_decrypt(data, key: , iv: , block_octets: BLOCK_OCTETS)
153
153
  block_bits = block_bits * 8
@@ -5,6 +5,29 @@ module Eco
5
5
  module Files
6
6
  class Directory
7
7
 
8
+ class << self
9
# Makes sure the directory for `path` exists, creating each missing
# directory level one at a time (from the root of the expanded path down).
# @param path [String] a directory path, or a file path when `includes_file` is `true`.
# @param includes_file [Boolean] when `true`, the last segment of `path` is
#   treated as a file name and is dropped before creating directories.
# @return [Boolean] `true` if the target already existed or was created,
#   `false` if creating any of the directories failed.
def create(path, includes_file: false)
  return true if Files.file_exists?(path)

  parts = Files.split(File.expand_path(path))
  parts.pop if includes_file

  return true if Files.dir_exists?(File.join(*parts))

  subpath = nil
  begin
    parts.each do |curr|
      subpath = subpath ? File.join(subpath, curr) : curr
      Dir.mkdir(subpath) unless Files.dir_exists?(subpath)
    end
  rescue StandardError => e
    # Only ordinary failures (e.g. permission or path errors) are reported
    # as `false`; interrupts and exit requests must keep propagating.
    pp e
    return false
  end
  true
end
29
+ end
30
+
8
31
  attr_reader :dir_path
9
32
 
10
33
  def initialize(dir_path = Dir.pwd)
@@ -14,12 +37,14 @@ module Eco
14
37
  end
15
38
 
16
39
  def exists?
17
- Files.dir_exists(@dir_path)
40
+ Files.dir_exists?(@dir_path)
18
41
  end
19
42
 
20
43
  def create
21
- succeed = Directory.create(File.expand_path(@dir_path)) unless self.exists?
22
- self.full_path if succeed
44
+ return self.full_path if self.exists?
45
+ if succeed = Directory.create(File.expand_path(@dir_path))
46
+ return self.full_path
47
+ end
23
48
  end
24
49
 
25
50
  def full_path
@@ -57,23 +82,6 @@ module Eco
57
82
  File.join(*args)
58
83
  end
59
84
 
60
- def self.create(path, includes_file: false)
61
- return true if Files.file_exists?(path)
62
- parts = Files.split(File.expand_path(path))
63
- filename = parts.pop if includes_file
64
- return true if Files.dir_exists?(File.join(*parts))
65
- subpath = nil
66
- begin
67
- parts.each do |curr|
68
- subpath = subpath ? File.join(subpath, curr) : curr
69
- Dir.mkdir(subpath) unless Files.dir_exists?(subpath)
70
- end
71
- rescue Exception => e
72
- pp e
73
- end
74
- false
75
- end
76
-
77
85
  private
78
86
 
79
87
  def file_pattern(value)
@@ -3,11 +3,13 @@ module Eco
3
3
  module Files
4
4
  DEFAULT_TIMESTAMP_PATTERN = '%Y-%m-%dT%H%M%S'
5
5
 
6
- def self.included(base)
7
- base.send(:include, InstanceMethods)
8
- base.extend(ClassMethods)
6
+ class << self
7
+ def included(base)
8
+ base.send(:include, InstanceMethods)
9
+ base.extend(ClassMethods)
10
+ end
9
11
  end
10
-
12
+
11
13
  module InstanceMethods
12
14
 
13
15
  end
@@ -0,0 +1,201 @@
1
+ require 'fuzzy_match'
2
+ require 'amatch'
3
+ require 'jaro_winkler'
4
+
5
+ require_relative 'fuzzy_match/stop_words'
6
+ require_relative 'fuzzy_match/array_helpers'
7
+ require_relative 'fuzzy_match/string_helpers'
8
+ require_relative 'fuzzy_match/pairing'
9
+ require_relative 'fuzzy_match/chars_position_score'
10
+ require_relative 'fuzzy_match/ngrams_score'
11
+
12
module Eco
  module Data
    # Mixin that adds fuzzy string matching to the including class.
    # Including it adds `InstanceMethods` to instances and `ClassMethods` to the class.
    # The `ClassMethods` are also available directly on `Eco::Data::FuzzyMatch`.
    module FuzzyMatch

      class << self
        # Hook that wires both the instance and the class level helpers into `base`.
        # @param base [Module] the class or module that includes `FuzzyMatch`.
        def included(base)
          base.send(:include, InstanceMethods)
          base.extend(ClassMethods)
        end
      end

      module ClassMethods
        include ArrayHelpers
        include StringHelpers
        include Pairing
        include CharsPositionScore
        include NGramsScore

        # Jaro-Winkler result for the two strings, as given by `JaroWinkler.distance`.
        # @param str1 [String, nil]
        # @param str2 [String, nil]
        # @param options [Hash] forwarded to `JaroWinkler.distance`
        #   (defaults: `ignore_case: true`, `weight: 0.25`).
        # @return [Numeric] `0` when either string is missing.
        def jaro_winkler(str1, str2, **options)
          return 0 if !str1 || !str2

          defaults = { ignore_case: true, weight: 0.25 }
          JaroWinkler.distance(str1, str2, **defaults.merge(options))
        end

      end

      module InstanceMethods
        FUZZY_MATCH_OPTIONS = [
          :identities, :groupings, :stop_words, :read,
          :must_match_grouping, :must_match_at_least_one_word,
          :gather_last_result, :threshold
        ].freeze

        JARO_OPTIONS     = [:ignore_case, :weight].freeze
        NGRAMS_OPTIONS   = [:range].freeze
        POSITION_OPTIONS = [:max_distance].freeze
        RESULTS_OPTIONS  = [:order, :threshold].freeze

        include StopWords

        # The reader is hand-written below (it lazily defaults to `{}`),
        # so only the writer is generated here.
        attr_writer :fuzzy_options

        # @return [Hash] the options of the last `fuzzy_match` call (`{}` if none yet).
        def fuzzy_options
          @fuzzy_options ||= {}
        end

        # Builds the underlying `::FuzzyMatch` matcher, or returns the memoized one.
        # The memoized matcher is only reused when no `haystack_data` is given and
        # the matcher options derived from `options` equal the current ones.
        # @param haystack_data [Enumerable, nil] the items to search among.
        # @param options [Hash] stored as `fuzzy_options`.
        # @return [::FuzzyMatch]
        def fuzzy_match(haystack_data = nil, **options)
          if instance_variable_defined?(:@fuzzy_match) && !haystack_data
            return @fuzzy_match if fuzzy_match_options == fuzzy_match_options(options)
          end
          @fuzzy_options = options
          # make it run with a native C extension (for better performance: ~130 % increase of performance)
          ::FuzzyMatch.engine = :amatch
          @fuzzy_match = ::FuzzyMatch.new(haystack(haystack_data), fuzzy_match_options)
        end

        # TODO: integration for options[:unique_words] => to ensure repeated words do not bring down the score are cut by threshold
        # @note
        #   - When the `haystack` elements are **non** `String` objects, it excludes the needle itself from the results
        # @param needle [String, Object] object is allowed when `fuzzy_options` includes `read:` key.
        # @param needle_str [String, nil] the actual value of needle_str to be used.
        # @param haystack [Enumerable] the items to find `needle` among.
        # @return [Eco::Data::FuzzyMatch::Results]
        def find_all_with_score(needle, needle_str: nil, haystack: nil, **options)
          base_match    = fuzzy_match(haystack, **options)
          match_results = base_match.find_all_with_score(needle_str || needle)
          needle_str  ||= item_string(needle)

          scored = match_results.each_with_object([]) do |fuzzy_results, out|
            item, dice, lev = fuzzy_results
            # an item equal to the needle is not a result of its own search
            next if item == needle

            item_str = item_string(item)

            # a blank string on either side cannot match anything
            if item_str.to_s.strip.empty? || needle_str.to_s.strip.empty?
              dice = lev = jaro_res = ngram_res = wngram_res = pos_res = 0
            end

            jaro_res   ||= jaro(needle_str, item_str)
            ngram_res  ||= ngram(needle_str, item_str)
            wngram_res ||= words_ngram(needle_str, item_str)
            pos_res    ||= position(needle_str, item_str)

            out << Result.new(item, item_str, needle_str, dice, lev, jaro_res, ngram_res, wngram_res, pos_res)
          end

          Results.new(needle, needle_str, scored).tap do |res|
            res.order     = fuzzy_options[:order]     if fuzzy_options[:order]
            res.threshold = fuzzy_options[:threshold] if fuzzy_options[:threshold]
          end.relevant_results
        end

        # Re-scores already obtained `results` against strings provided by the block.
        # @param results [Eco::Data::FuzzyMatch::Results] the results to re-score.
        # @param needle_str [String, nil] overrides `results.value` as the needle string given to the block.
        # @param options [Hash] `:order` and `:threshold` are applied to the new results.
        # @yield [needle_str, item_str, needle, item] the current strings and objects of each result.
        # @yieldreturn [Array<(String, String)>] the needle and item strings to score with.
        # @raise [RuntimeError] when no block is given.
        # @return [Eco::Data::FuzzyMatch::Results]
        def recalculate_results(results, needle_str: nil, **options)
          raise "You should provide a block |needle_str, item_str, needle, item|" unless block_given?

          rescored = results.each_with_object([]) do |result, out|
            nstr, istr = yield(needle_str || results.value, result.value, results.needle, result.match)

            if istr.to_s.strip.empty?
              dice = lev = jaro_res = ngram_res = wngram_res = pos_res = 1
            elsif nstr.to_s.strip.empty?
              # NOTE(review): the original condition was `unless istr = needle_str`
              # (an assignment, not `==`). That behaviour is preserved as is;
              # please confirm whether a comparison was intended.
              istr = needle_str
              dice = lev = jaro_res = ngram_res = wngram_res = pos_res = 0 unless istr
            end

            res          = ::FuzzyMatch.score_class.new(nstr, istr) unless dice && lev
            dice       ||= res&.dices_coefficient_similar || 0
            lev        ||= res&.levenshtein_similar       || 0
            jaro_res   ||= jaro(nstr, istr)
            ngram_res  ||= ngram(nstr, istr)
            wngram_res ||= words_ngram(nstr, istr)
            pos_res    ||= position(nstr, istr)

            out << Result.new(*result.values_at(:match, :value, :needle_str), dice, lev, jaro_res, ngram_res, wngram_res, pos_res)
          end

          Results.new(results.needle, results.value, rescored).tap do |res|
            res.order     = options[:order]     if options[:order]
            res.threshold = options[:threshold] if options[:threshold]
          end.relevant_results
        end

        private

        # Jaro-Winkler using the `JARO_OPTIONS` present in `fuzzy_options`.
        def jaro(str1, str2)
          options = fuzzy_options.slice(*JARO_OPTIONS)
          self.class.jaro_winkler(str1, str2, **options)
        end

        # Ratio of `ngrams_score` (default `range: 3..5`, unless given in `fuzzy_options`).
        def ngram(str1, str2)
          options = { range: 3..5 }.merge(fuzzy_options.slice(*NGRAMS_OPTIONS))
          self.class.ngrams_score(str1, str2, **options).ratio
        end

        # Ratio of `words_ngrams_score` (default `range: 3..7`, unless given in `fuzzy_options`).
        def words_ngram(str1, str2)
          options = { range: 3..7 }.merge(fuzzy_options.slice(*NGRAMS_OPTIONS))
          self.class.words_ngrams_score(str1, str2, **options).ratio
        end

        # Ratio of `chars_position_score` using the `POSITION_OPTIONS` present in `fuzzy_options`.
        def position(str1, str2)
          options = fuzzy_options.slice(*POSITION_OPTIONS)
          self.class.chars_position_score(str1, str2, **options).ratio
        end

        # @note
        #   - When `data` is not given and `self` is an `Enumerable`, `self` is used.
        #   - It uses `values` if the data is a `Hash`, and `to_a` otherwise.
        # @param data [Enumerable, nil]
        # @raise [RuntimeError] when there is no `Enumerable` to use, or when the
        #   items are not `String` and no `read:` option was provided.
        # @return [Array<Object>] the non-repeated values of `data`
        def haystack(data = nil)
          data = self if self.is_a?(Enumerable) && !data
          raise "'data' should be an Enumerable. Given: #{data.class}" unless data.is_a?(Enumerable)

          # work on the resolved `data` (not on `self`), so an explicit haystack is honoured
          data = data.is_a?(Hash) ? data.values.flatten : data.to_a.flatten
          data.uniq.compact.tap do |items|
            if !fuzzy_read_method && found = items.find {|item| !item.is_a?(String)}
              raise "To use non String objects as 'haystack' you should provide `read:` or `options[:read]`. Given element: #{found.class}"
            end
          end
        end

        # Obtains the `String` to compare for `item`.
        # @param item [String, Object, nil]
        # @param attr [Proc, Symbol, String, nil] how to read the string out of `item`.
        # @return [String, Object, nil] `item` itself when it is `nil`, a `String` or there is no `attr`;
        #   `nil` when `item` does not respond to `attr`.
        def item_string(item, attr = fuzzy_read_method)
          return item if !item || item.is_a?(String) || !attr
          return attr.call(item) if attr.is_a?(Proc)

          attr = attr.to_sym
          return item.send(attr) if item.respond_to?(attr)
        end

        # Options to hand over to `::FuzzyMatch.new`.
        # NOTE(review): the default stop words are merged last, so they replace
        # any `:stop_words` given in the options; confirm this is intended.
        # @param options [Hash, nil] defaults to `fuzzy_options`.
        # @return [Hash]
        def fuzzy_match_options(options = nil)
          options = fuzzy_options unless options
          options.slice(*FUZZY_MATCH_OPTIONS).merge({
            stop_words: PREPOSITIONS + PRONOUNS + ARTICLES
          })
        end

        # @return [Proc, Symbol, String, nil] the `read:` option currently in use.
        def fuzzy_read_method
          fuzzy_match_options[:read]
        end

      end

      class << self
        include FuzzyMatch::ClassMethods
      end

    end
  end
end
198
+
199
+ require_relative 'fuzzy_match/score'
200
+ require_relative 'fuzzy_match/result'
201
+ require_relative 'fuzzy_match/results'
@@ -0,0 +1,75 @@
1
module Eco
  module Data
    module FuzzyMatch
      # Helpers to build n-grams, combinations and permutations out of an `Array`.
      module ArrayHelpers
        # Keeps the start order of the `values` and consecutive `values` together/consecutive.
        # Each run of consecutive values is joined with a space into one `String`.
        # @param values [Array] the input array with the values.
        # @param range [Integer, Range] determine the length of the generated runs.
        # @return [Array<String>] the unique runs of `range` length of `values`;
        #   empty when no run of the requested length fits in `values`.
        def ngrams(values, range = 2..3)
          unless range.is_a?(Integer)
            return range.flat_map { |size| ngrams(values, size) }.uniq
          end
          # there is no run of zero or negative length (and `each_cons` rejects such sizes)
          return [] unless range > 0

          values.each_cons(range).map { |run| run.join(' ') }.uniq
        end

        # Keeps the start order of the `values` of the input `Array` `values`.
        # It does **not** keep consecutive `values` together (it can jump/skip items).
        # @param values [Array] the input array with the values.
        # @param range [Integer, Range] determine the length of the generated values.
        # @return [Array<Array<Value>>] combinations of `range` length of `values`
        def combinations(values, range = 2..3)
          return values.combination(range).to_a if range.is_a?(Integer)

          range.flat_map { |size| values.combination(size).to_a }
        end

        # It includes `combinations` that break the initial order of the `Array`.
        # It does **not** keep consecutive `values` together (it can jump/skip items).
        # @param values [Array] the input array with the values.
        # @param range [Integer, Range] determine the length of the generated values.
        # @return [Array<Array<Value>>] permutations of `range` length of `values`;
        #   the order-preserving combinations come first.
        def permutations(values, range = 2..3)
          combos = combinations(values, range)
          (combos + combos.flat_map { |combo| combo.permutation.to_a }).uniq
        end

        # Helper to prepare a facet structure
        # @param values1 [Array] the input array with the values to have their facet against.
        # @param values2 [Array] the input array with the values to facet against.
        # @return [Hash] where `keys` are `values1` and `value` of each `key` a copy of `values2`;
        #   empty when `values1` is not an `Enumerable`.
        def facet(values1, values2)
          return {} unless values1.is_a?(Enumerable)

          keys = values1.is_a?(Hash) ? values1.values : values1.to_a
          keys.each_with_object({}) { |val, out| out[val] = values2.dup }
        end

      end
    end
  end
end
@@ -0,0 +1,38 @@
1
module Eco
  module Data
    module FuzzyMatch
      module CharsPositionScore
        # Walks the characters of `str1` and looks each one up in `str2`.
        # A character scores when its first occurrence in `str2` is located before
        # the expected position plus `max_distance`. The expected position moves to
        # where the character was found, or just one step forward when it did not score.
        # @note This algorithm is best suited for matching mis-spellings.
        # @param str1 [String, nil] the string whose characters are looked up.
        # @param str2 [String, nil] the string to look the characters up in.
        # @param max_distance [Integer] maximum char position distance to score.
        # @param normalized [Boolean] to avoid double ups in normalizing.
        # @return [Score] the score object with the result.
        def chars_position_score(str1, str2, max_distance: 3, normalized: false)
          str1, str2 = normalize_string([str1, str2]) unless normalized

          result = Score.new(0, 0)
          return result if !str1 || !str2 || str1.empty? || str2.empty?

          result.total = str1.length
          if str1 == str2
            result.increase(result.total)
            return result
          end
          return result if str1.length < 2

          expected = 0
          str1.each_char do |char|
            expected += 1
            found_at  = str2.index(char)
            next unless found_at && found_at < (expected + max_distance)

            result.increase
            expected = found_at
          end
          result
        end

      end
    end
  end
end
@@ -0,0 +1,82 @@
1
module Eco
  module Data
    module FuzzyMatch
      # N-gram based scores between two strings.
      module NGramsScore
        # It does the following:
        #   1. It splits both strings into words
        #   2. Pairs all words by best `ngrams_score` match
        #   3. Gives `0` score to those words of `str2` that lost their pair (a word of `str1` cannot be paired twice)
        #   4. Merges the `ngrams_score` of all the paired words of `str2` against their `str1` word pair
        # @param str1 [String, nil]
        # @param str2 [String, nil]
        # @param range [Integer, Range] determine the length of the generated values for each `word`.
        # @param normalized [Boolean] to avoid double ups in normalizing.
        # @return [Score] the score object with the result.
        def words_ngrams_score(str1, str2, range: 3..5, normalized: false)
          str1, str2 = normalize_string([str1, str2]) unless normalized
          len1 = str1 && str1.length

          Score.new(0, 0).tap do |score|
            next if !str2 || !str1
            next score.increase_total(len1) if str2.empty? || str1.empty?

            if str1 == str2
              score.total = len1
              score.increase(score.total)
            end
            # either string being a single character adds `len1` to the total
            if str1.length < 2 || str2.length < 2
              score.increase_total(len1)
            end

            paired_words(str1, str2, normalized: true) do |needle, item|
              ngrams_score(needle, item, range: range, normalized: true)
            end.each do |_word, data|
              _item, iscore = data
              score.merge!(iscore)
            end
          end
        end

        # A score is kept of matching ngram combinations of `str2`.
        # @note This algorithm is best suited for matching sentences, or 'firstname lastname' compared with 'lastname firstname' combinations.
        # @param str1 [String, nil] the string the ngrams are searched in.
        # @param str2 [String, nil] the string the ngrams are generated from.
        # @param range [Integer, Range] determine the length of the generated values.
        # @param normalized [Boolean] to avoid double ups in normalizing.
        # @return [Score] the score object with the result.
        def ngrams_score(str1, str2, range: 3..5, normalized: false)
          str1, str2 = normalize_string([str1, str2]) unless normalized
          len1 = str1 && str1.length

          Score.new(0, len1 || 0).tap do |score|
            next if !str2 || !str1
            next if str2.empty? || str1.empty?

            score.total = len1
            next score.increase(score.total) if str1 == str2
            next if str1.length < 2 || str2.length < 2

            grams       = word_ngrams(str2, range, normalized: true)
            grams_count = grams.length
            next unless grams_count > 0

            if range.is_a?(Integer)
              # single ngram length: every ngram weighs the same share of the total
              item_weight = score.total.to_f / grams_count
              matches     = grams.count { |gram| str1.include?(gram) }
              score.increase(matches * item_weight)
            else
              # several ngram lengths: the total is split evenly among the lengths
              groups       = grams.group_by(&:length)
              group_weight = (1.0 / groups.length).round(3)

              groups.each_value do |len_grams|
                len_max_score = score.total * group_weight
                # the weight of an ngram is relative to the count of ngrams of all lengths
                item_weight   = len_max_score / grams_count
                matches       = len_grams.count { |gram| str1.include?(gram) }
                score.increase(matches * item_weight)
              end
            end
          end
        end

      end
    end
  end
end