eco-helpers 2.0.12 → 2.0.17
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +80 -73
- data/eco-helpers.gemspec +6 -4
- data/lib/eco-helpers.rb +1 -0
- data/lib/eco/api/common/base_loader.rb +14 -0
- data/lib/eco/api/common/loaders/use_case.rb +1 -1
- data/lib/eco/api/common/people/default_parsers/date_parser.rb +11 -1
- data/lib/eco/api/common/people/default_parsers/login_providers_parser.rb +1 -1
- data/lib/eco/api/common/people/default_parsers/policy_groups_parser.rb +11 -11
- data/lib/eco/api/common/people/person_entry.rb +9 -2
- data/lib/eco/api/common/people/supervisor_helpers.rb +27 -0
- data/lib/eco/api/common/session/file_manager.rb +2 -2
- data/lib/eco/api/common/session/mailer.rb +0 -1
- data/lib/eco/api/common/session/s3_uploader.rb +0 -1
- data/lib/eco/api/common/session/sftp.rb +0 -1
- data/lib/eco/api/error.rb +5 -3
- data/lib/eco/api/microcases.rb +3 -1
- data/lib/eco/api/microcases/append_usergroups.rb +0 -1
- data/lib/eco/api/microcases/people_cache.rb +2 -2
- data/lib/eco/api/microcases/people_load.rb +2 -2
- data/lib/eco/api/microcases/people_refresh.rb +2 -2
- data/lib/eco/api/microcases/people_search.rb +6 -6
- data/lib/eco/api/microcases/preserve_default_tag.rb +23 -0
- data/lib/eco/api/microcases/preserve_filter_tags.rb +28 -0
- data/lib/eco/api/microcases/preserve_policy_groups.rb +30 -0
- data/lib/eco/api/microcases/set_account.rb +0 -1
- data/lib/eco/api/organization.rb +1 -0
- data/lib/eco/api/organization/people.rb +7 -0
- data/lib/eco/api/organization/people_analytics.rb +60 -0
- data/lib/eco/api/organization/presets_factory.rb +116 -93
- data/lib/eco/api/organization/presets_integrity.json +58 -0
- data/lib/eco/api/organization/presets_values.json +5 -4
- data/lib/eco/api/policies/default_policies/99_user_access_policy.rb +0 -30
- data/lib/eco/api/session.rb +1 -20
- data/lib/eco/api/session/batch.rb +42 -10
- data/lib/eco/api/session/batch/job.rb +3 -0
- data/lib/eco/api/session/config.rb +16 -15
- data/lib/eco/api/session/config/api.rb +4 -0
- data/lib/eco/api/session/config/apis.rb +14 -0
- data/lib/eco/api/session/config/files.rb +7 -0
- data/lib/eco/api/session/config/people.rb +3 -19
- data/lib/eco/api/usecases.rb +2 -0
- data/lib/eco/api/usecases/default_cases.rb +4 -1
- data/lib/eco/api/usecases/default_cases/abstract_policygroup_abilities_case.rb +161 -0
- data/lib/eco/api/usecases/default_cases/analyse_people_case.rb +76 -0
- data/lib/eco/api/usecases/default_cases/codes_to_tags_case.rb +2 -3
- data/lib/eco/api/usecases/default_cases/hris_case.rb +14 -8
- data/lib/eco/api/usecases/default_cases/reset_landing_page_case.rb +11 -1
- data/lib/eco/api/usecases/default_cases/restore_db_case.rb +1 -2
- data/lib/eco/api/usecases/default_cases/supers_cyclic_identify_case.rb +72 -0
- data/lib/eco/api/usecases/default_cases/supers_hierarchy_case.rb +59 -0
- data/lib/eco/api/usecases/default_cases/to_csv_case.rb +104 -26
- data/lib/eco/api/usecases/default_cases/to_csv_detailed_case.rb +62 -36
- data/lib/eco/cli.rb +0 -10
- data/lib/eco/cli/config/default/options.rb +19 -17
- data/lib/eco/cli/config/default/people_filters.rb +3 -3
- data/lib/eco/cli/config/default/usecases.rb +77 -25
- data/lib/eco/cli/config/default/workflow.rb +6 -1
- data/lib/eco/cli/config/help.rb +1 -0
- data/lib/eco/cli/config/options_set.rb +106 -13
- data/lib/eco/cli/config/use_cases.rb +33 -33
- data/lib/eco/cli/scripting/args_helpers.rb +30 -3
- data/lib/eco/data.rb +1 -0
- data/lib/eco/data/crypto/encryption.rb +3 -3
- data/lib/eco/data/files/directory.rb +28 -20
- data/lib/eco/data/files/helpers.rb +6 -4
- data/lib/eco/data/fuzzy_match.rb +119 -0
- data/lib/eco/data/fuzzy_match/array_helpers.rb +75 -0
- data/lib/eco/data/fuzzy_match/chars_position_score.rb +37 -0
- data/lib/eco/data/fuzzy_match/ngrams_score.rb +73 -0
- data/lib/eco/data/fuzzy_match/pairing.rb +102 -0
- data/lib/eco/data/fuzzy_match/result.rb +67 -0
- data/lib/eco/data/fuzzy_match/results.rb +53 -0
- data/lib/eco/data/fuzzy_match/score.rb +44 -0
- data/lib/eco/data/fuzzy_match/stop_words.rb +35 -0
- data/lib/eco/data/fuzzy_match/string_helpers.rb +69 -0
- data/lib/eco/version.rb +1 -1
- metadata +86 -10
- data/lib/eco/api/microcases/refresh_abilities.rb +0 -19
- data/lib/eco/api/organization/presets_reference.json +0 -59
- data/lib/eco/api/usecases/default_cases/refresh_abilities_case.rb +0 -30
@@ -0,0 +1,37 @@
|
|
1
|
+
module Eco
  module Data
    module FuzzyMatch
      module CharsPositionScore
        # Scores `str1` against `str2` character by character.
        #
        # For each character of `str1`, its first occurrence in `str2` is looked up.
        # The character scores when that occurrence sits before `start + max_distance`,
        # where `start` is one past the position reached while processing the previous character.
        # A score is kept of matching characters; the `total` of the score is the length of `str1`.
        # @note This algorithm is best suited for matching mis-spellings.
        # @note NOTE(review): the lookup always takes the *first* occurrence in `str2` and only an
        #   upper bound is enforced, so a character found far *before* the tracked position still
        #   scores. Confirm whether a two-sided window was intended.
        # @param str1 [String] the string of reference.
        # @param str2 [String] the string where the characters of `str1` are searched.
        # @param max_distance [Integer] maximum char position distance to score.
        # @param normalized [Boolean] `true` when both strings are already normalized
        #   (avoids double ups in normalizing).
        # @return [Score] the score object with the result.
        def chars_position_score(str1, str2, max_distance: 3, normalized: false)
          str1, str2 = normalize_string([str1, str2]) unless normalized
          len1 = str1 ? str1.length : 0

          Score.new(0, len1).tap do |score|
            next unless str1 && str2
            # identical strings get the full score straight away
            next score.increase(score.total) if str1 == str2
            next if len1 < 2

            pos = 0
            str1.each_char do |char|
              start = pos + 1
              found = str2.index(char)
              if found && found < (start + max_distance)
                score.increase
                pos = found
              else
                # no (close enough) occurrence: just move the tracked position forward
                pos = start
              end
            end
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,73 @@
|
|
1
|
+
module Eco
  module Data
    module FuzzyMatch
      module NGramsScore
        # Scores two sentences word by word. It does the following:
        #   1. It splits both strings into words
        #   2. Pairs all words by best `ngrams_score` match
        #   3. Gives `0` score to those words of `str2` that lost their pair (a word of `str1` cannot be paired twice)
        #   4. Merges the `ngrams_score` of all the paired words of `str2` against their `str1` word pair
        # @param str1 [String] the string of reference.
        # @param str2 [String] the string to compare against `str1`.
        # @param range [Integer, Range] determine the length of the generated values for each `word`.
        # @param normalized [Boolean] `true` when both strings are already normalized
        #   (avoids double ups in normalizing).
        # @return [Score] the score object with the result.
        def words_ngrams_score(str1, str2, range: 3..5, normalized: false)
          str1, str2 = normalize_string([str1, str2]) unless normalized

          Score.new(0, 0).tap do |score|
            next unless str1 && str2

            if str1 == str2
              # the total starts at 0 here, so it must be set before granting the full score
              score.increase_total(str1.length)
              next score.increase(score.total)
            end
            next if str1.length < 2 || str2.length < 2

            pairs = paired_words(str1, str2, normalized: true) do |needle, item|
              ngrams_score(needle, item, range: range, normalized: true)
            end
            # each value is `[paired_word, Score]` (default `format` of `paired_words`)
            pairs.each_value { |(_item, iscore)| score.merge!(iscore) }
          end
        end

        # A score is kept of matching ngram combinations of `str2`.
        # The `total` of the score is the length of `str1`.
        # @note This algorithm is best suited for matching sentences, or 'firstname lastname'
        #   compared with 'lastname firstname' combinations.
        # @param str1 [String] the string where the ngrams of `str2` are searched.
        # @param str2 [String] the string the ngrams are generated from.
        # @param range [Integer, Range] determine the length of the generated values.
        #   With a `Range`, every ngram length gets the same share of the total score.
        # @param normalized [Boolean] `true` when both strings are already normalized
        #   (avoids double ups in normalizing).
        # @return [Score] the score object with the result.
        def ngrams_score(str1, str2, range: 3..5, normalized: false)
          str1, str2 = normalize_string([str1, str2]) unless normalized

          Score.new(0, str1 ? str1.length : 0).tap do |score|
            next unless str1 && str2
            next score.increase(score.total) if str1 == str2
            next if str1.length < 2 || str2.length < 2

            grams = word_ngrams(str2, range, normalized: true)
            next if grams.empty?

            if range.is_a?(Integer)
              item_weight = score.total.to_f / grams.length
              matches     = grams.count { |gram| str1.include?(gram) }
              score.increase(matches * item_weight)
            else
              groups       = grams.group_by(&:length)
              group_weight = (1.0 / groups.length).round(3)

              groups.each_value do |len_grams|
                len_max_score = score.total * group_weight
                item_weight   = len_max_score / len_grams.length
                matches       = len_grams.count { |gram| str1.include?(gram) }
                score.increase(matches * item_weight)
              end
            end
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,102 @@
|
|
1
|
+
module Eco
  module Data
    module FuzzyMatch
      module Pairing
        # Pair words using some algorithm.
        # It does the following:
        #   1. It splits both strings into words.
        #   2. Pairs all words by using `block` to score the best match.
        #   3. Gives `0` score to those words of `str1` that could not be paired
        #     (a word of `str1` cannot be paired twice, and neither can a word of `str2`).
        #   4. Returns the pairs, so the `Score` of all of them can be merged by the caller.
        # @note when both strings are equal every word is paired with itself with full score
        #   (the block is not called).
        # @note when any of the strings is shorter than 2 characters, `str1` is returned as
        #   the only key, with no pair and `0` score.
        # @yield [needle, item] offers a comparison algorithm between two strings.
        # @yieldparam needle [String] the string of reference.
        # @yieldparam item [String] one of the haystack items.
        # @yieldreturn [Eco::Data::FuzzyMatch::Score] the `Score` object with the results of comparing `needle` and `item`
        # @param str1 [String] the string of reference.
        # @param str2 [String] one of the haystack items.
        # @param format [Symbol, Array<Symbol>] determines the `values` of the returned `Hash`:
        #   1. `:pair` for just pair
        #   2. `:score` for just score
        #   3. `[:pair, :score]` for `Array`
        # @param normalized [Boolean] `true` when both strings are already normalized
        #   (avoids double ups in normalizing).
        # @return [Hash] where `keys` are the **words** of `str1` and their `values`:
        #   1. if `format` is `:pair` => the `str2` words with highest match.
        #   2. if `format` is `:score` => the `Score` words with highest match.
        #   3. if `format` is `[:pair, :score]` => both in an `Array`.
        def paired_words(str1, str2, format: [:pair, :score], normalized: false)
          str1, str2 = normalize_string([str1, str2]) unless normalized
          return {} unless str1 && str2

          # shapes a `{pair:, score:}` result as requested by `format`
          shape = lambda do |result|
            format.is_a?(Array) ? result.values_at(*format) : result[format]
          end

          if str1 == str2
            return get_words(str1, normalized: true).each_with_object({}) do |word, pairs|
              pairs[word] = shape.call({pair: word, score: Score.new(word.length, word.length)})
            end
          end

          if str1.length < 2 || str2.length < 2
            return {str1 => shape.call({pair: nil, score: Score.new(0, str1.length)})}
          end

          needles  = get_words(str1, normalized: true)
          haystack = get_words(str2, normalized: true)

          # ranking: per haystack item, the needles that scored relevantly against it
          # faceted: per needle, all the haystack items sorted by best score first
          ranking = {}
          faceted = needles.each_with_object({}) do |needle, memo|
            scored = haystack.map do |item|
              result = {pair: item, score: yield(needle, item)}
              ranking[item] ||= []
              ranking[item] << {needle: needle, score: result[:score]} if result[:score].ratio > 0.05
              result
            end
            memo[needle] = scored.sort_by { |result| result[:score].ratio }.reverse
          end

          # 1st round: each item takes its best needle among those still unpaired
          paired = {}
          ranking.each do |item, results|
            best = results
              .reject { |result| paired.key?(result[:needle]) }
              .sort_by { |result| result[:score].ratio }
              .reverse
              .first
            paired[best[:needle]] = {pair: item, score: best[:score]} if best
          end

          # 2nd round: unpaired needles take their best item among those still unpaired
          pending_items = haystack - paired.values.map { |result| result[:pair] }
          faceted.each do |needle, results|
            next if paired.key?(needle)

            result = results.find do |candidate|
              pending_items.include?(candidate[:pair]) && candidate[:score].ratio > 0.05
            end
            next unless result

            paired[needle] = result
            pending_items.delete(result[:pair])
          end

          # needles that lost their pair get no pair and `0` score
          (needles - paired.keys).each do |needle|
            paired[needle] = {pair: nil, score: Score.new(0, needle.length)}
          end

          paired.transform_values(&shape)
        end
      end
    end
  end
end
|
@@ -0,0 +1,67 @@
|
|
1
|
+
module Eco
  module Data
    module FuzzyMatch
      # One candidate `value` matched against a needle, together with the ratio obtained by each algorithm.
      # Results sort by the algorithms listed in `order`, best ratio first.
      # @note it inherits from the `Struct` so the member readers can be overridden by using `super`.
      class Result < Struct.new(:match, :value, :dice, :levenshtein, :jaro_winkler, :ngrams, :words_ngrams, :chars_position)
        ALL_METHODS   = [:dice, :levenshtein, :jaro_winkler, :ngrams, :words_ngrams, :chars_position].freeze
        DEFAULT_ORDER = [:words_ngrams, :dice].freeze

        # Readers of the algorithms: the stored value rounded to 3 decimals (or `nil` if not calculated).
        def dice
          super&.round(3)
        end

        def levenshtein
          super&.round(3)
        end

        def jaro_winkler
          super&.round(3)
        end

        def ngrams
          super&.round(3)
        end

        def words_ngrams
          super&.round(3)
        end

        def chars_position
          super&.round(3)
        end

        # TODO: print in the order of `order`
        # @note it does not write to the output; it only builds the message.
        # @return [String] a one line summary of all the ratios and the `value`.
        def print
          [
            "(Dice: #{dice}) (Lev Dst: #{levenshtein})",
            "(Jaro: #{jaro_winkler})",
            "(Ngram: #{ngrams}) (WNgrams: #{words_ngrams})",
            "(C Pos: #{chars_position})",
            "'#{value}'"
          ].join(' ')
        end

        # @param methods [Symbol, Array<Symbol>] the algorithms to check.
        # @param threshold [Numeric, nil] the minimum ratio; when `nil`/`false` no check is done.
        # @return [Boolean] `true` if **all** the `methods` reach the `threshold`.
        def all_threshold?(methods = order, threshold = 0.15)
          return true unless threshold

          [methods].flatten.compact.all? { |method| threshold?(method, threshold) }
        end

        # @param methods [Symbol, Array<Symbol>] the algorithms to check.
        # @param threshold [Numeric, nil] the minimum ratio; when `nil`/`false` no check is done.
        # @return [Boolean] `true` if **any** of the `methods` reaches the `threshold`.
        def any_threshold?(methods = order, threshold = 0.15)
          return true unless threshold

          [methods].flatten.compact.any? { |method| threshold?(method, threshold) }
        end

        # @param method [Symbol] the algorithm to check.
        # @param threshold [Numeric] the minimum ratio.
        # @raise [RuntimeError] if this object does not respond to `method`.
        # @return [Boolean] `true` if the ratio of `method` reaches the `threshold`.
        #   A ratio that has not been calculated (`nil`) never reaches it.
        def threshold?(method = :dice, threshold = 0.15)
          raise "Unknown method '#{method}'" unless respond_to?(method)

          ratio = send(method)
          !ratio.nil? && ratio >= threshold
        end

        # @param values [Symbol, Array<Symbol>] the algorithms to sort by, in order of priority.
        #   When empty it falls back to the default order.
        def order=(values)
          @order = [values].flatten.compact
          @order = DEFAULT_ORDER.dup if @order.empty?
        end

        # @return [Array<Symbol>] the algorithms to sort by, in order of priority.
        def order
          @order ||= DEFAULT_ORDER.dup
        end

        # Higher ratios go first.
        def <=>(result)
          compare(result)
        end

        private

        # Compares by the first algorithm of `order` where the ratios differ.
        # A ratio that has not been calculated (`nil`) sorts as the lowest possible one.
        def compare(other, order: self.order)
          order.each do |method|
            unless respond_to?(method) && other.respond_to?(method)
              raise "Unknown method '#{method}'"
            end

            mine   = send(method) || -Float::INFINITY
            theirs = other.send(method) || -Float::INFINITY
            return -1 if mine > theirs
            return 1 if mine < theirs
          end
          0
        end
      end
    end
  end
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
module Eco
  module Data
    module FuzzyMatch
      # The candidates (`raw_results`) obtained when searching for a `needle` (of string `value`).
      class Results < Struct.new(:needle, :value, :raw_results)
        DEFAULT_ORDER = [:words_ngrams, :dice].freeze

        # @return [Results] the results that reach a `jaro_winkler` ratio of `0.5`.
        def results_with_false_positives
          relevant_results(methods: :jaro_winkler, threshold: 0.5)
        end

        # @param methods [Symbol, Array<Symbol>] the algorithms that must reach the `threshold`.
        #   They also become the `order` of the returned object.
        # @param threshold [Numeric, nil] the minimum ratio required to **all** the `methods`.
        # @return [Results] a new object that only holds the results that passed the filter.
        def relevant_results(methods: order, threshold: 0.5)
          filtered = (raw_results || []).select do |result|
            result.all_threshold?(methods, threshold)
          end
          self.class.new(needle, value, filtered).tap do |results|
            results.order = methods
          end
        end

        # Sets the sorting criteria, which is propagated to every result.
        # @param values [Symbol, Array<Symbol>] the algorithms to sort by, in order of priority.
        #   When empty it falls back to the default order.
        def order=(values)
          @order = [values].flatten.compact
          @order = DEFAULT_ORDER.dup if @order.empty?
          (raw_results || []).each { |result| result.order = @order }
        end

        # @return [Array<Symbol>] the algorithms to sort by, in order of priority.
        def order
          @order ||= DEFAULT_ORDER.dup
        end

        # @return [Array] the `raw_results` sorted by their own `<=>`.
        def results
          (raw_results || []).sort
        end

        # Writes to the standard output the `value` followed by one line per sorted result.
        def print
          lines = results.map(&:print).join("\n  ")

          puts "'#{value}':\n  " + lines
        end

        private

        # @param item [String, Object, nil] the object to get the string from.
        # @param attr [String, Symbol, nil] the method of `item` that provides the string.
        # @return [String, Object, nil] `item` itself when it is falsey, a `String` or no `attr` is given;
        #   otherwise the value of `attr` (`nil` if `item` does not respond to it).
        def item_string(item, attr = nil)
          return item if !item || item.is_a?(String) || !attr

          attr = attr.to_sym
          item.send(attr) if item.respond_to?(attr)
        end
      end
    end
  end
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
module Eco
  module Data
    module FuzzyMatch
      # Keeps the `score` obtained out of a `total` (maximum possible score).
      class Score < Struct.new(:score, :total)
        # @param decimals [Integer] number of decimals to round to.
        # @return [Float] `score / total` (a missing `score` counts as `0` and a missing `total` as `1`).
        #   It is `0.0` when `total` is `0`, so the result can always be compared and sorted.
        def ratio(decimals = 6)
          divisor = total || 1
          return 0.0 if divisor.zero?

          ((score || 0).to_f / divisor).round(decimals)
        end

        # @param decimals [Integer] number of decimals to round to.
        # @return [Float] the `ratio` as a percentage.
        def percent(decimals = 3)
          (100 * ratio).round(decimals)
        end

        # @param value [Numeric] the amount to add to the `score`.
        # @return [Numeric] the new `score`.
        def increase(value = 1)
          self.score = (score || 0) + value
        end

        # @param value [Numeric] the amount to add to the `total`.
        # @return [Numeric] the new `total`.
        def increase_total(value)
          self.total = (total || 0) + value
        end

        # @note it overrides `Struct#values_at`: it receives member/method names rather than indexes.
        # @param keys [Array<Symbol>] the names of the methods to get the values from.
        # @return [Array] the value of each key (`nil` for those this object does not respond to).
        def values_at(*keys)
          keys.map { |key| send(key) if respond_to?(key) }
        end

        # Merges 2 Score instance objects
        # @param value [Score] the score to add up.
        # @return [Score] a new object; neither `self` nor `value` are modified.
        def merge(value)
          self.class.new(*values_at(:score, :total)).merge!(value)
        end

        # Adds up the `score` and the `total` of `value` to this object.
        # @raise [RuntimeError] if `value` is not a `Score`.
        # @param value [Score] the score to add up.
        # @return [Score] this object.
        def merge!(value)
          raise "Expecting Score object. Given: #{value.class}" unless value.is_a?(Score)

          increase(value.score)
          increase_total(value.total)
          self
        end
      end
    end
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
module Eco
  module Data
    module FuzzyMatch
      # Lists of English words that usually carry little meaning when matching strings.
      # @note the lists are frozen: `dup` them if a customized version is required.
      module StopWords
        PREPOSITIONS = [
          "aboard", "about", "above", "across", "after", "against", "along", "amid", "among", "around", "as", "at",
          "before", "behind", "below", "beneath", "beside", "between", "beyond", "but", "by",
          "concerning", "considering", "despite", "down", "during", "except", "following", "for", "from",
          "in", "inside", "into", "like", "minus", "near", "next",
          "of", "off", "on", "onto", "opposite", "out", "outside", "over", "past", "per", "plus",
          "regarding", "round", "save", "since", "than", "through", "till", "to", "toward",
          "under", "underneath", "unlike", "until", "up", "upon", "versus", "via",
          "with", "within", "without"
        ].freeze

        # NOTE(review): "I" is the only entry that is not downcased; verify how callers compare it
        # against normalized (downcased) words.
        PRONOUNS = [
          "all", "another", "any", "anybody", "anyone", "anything", "as", "aught",
          "both", "each", "each other", "either", "enough", "everybody", "everyone", "everything",
          "few", "he", "her", "hers", "herself", "him", "himself", "his", "I", "idem", "it", "its", "itself",
          "many", "me", "mine", "most", "my", "myself", "naught", "neither", "no one", "nobody", "none", "nothing", "nought",
          "one", "one another", "other", "others", "ought", "our", "ours", "ourself", "ourselves",
          "several", "she", "some", "somebody", "someone", "something", "somewhat", "such", "suchlike",
          "that", "thee", "their", "theirs", "theirself", "theirselves", "them", "themself", "themselves", "there",
          "these", "they", "thine", "this", "those", "thou", "thy", "thyself", "us",
          "we", "what", "whatever", "whatnot", "whatsoever", "whence", "where", "whereby", "wherefrom",
          "wherein", "whereinto", "whereof", "whereon", "wherever", "wheresoever", "whereto", "whereunto",
          "wherewith", "wherewithal", "whether", "which", "whichever", "whichsoever", "who", "whoever", "whom",
          "whomever", "whomso", "whomsoever", "whose", "whosever", "whosesoever", "whoso", "whosoever",
          "ye", "yon", "yonder", "you", "your", "yours", "yourself", "yourselves"
        ].freeze

        ARTICLES = ["a", "an", "the"].freeze
      end
    end
  end
end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
module Eco
  module Data
    module FuzzyMatch
      module StringHelpers
        # Downcases and trims
        # @param value [String, Symbol, Array] the value(s) to normalize.
        # @return [String, Array, nil] the normalized `String` (an `Array` of them if `value` is an `Array`).
        #   It is `nil` for any other type of `value`.
        def normalize_string(value)
          case value
          when Array
            value.map { |val| normalize_string(val) }
          when Symbol
            normalize_string(value.to_s)
          when String
            value.downcase.strip
          end
        end

        # @param str [String, nil] the input string.
        # @param normalized [Boolean] `true` when `str` is already normalized.
        # @return [Array<String>] the words of `str`: runs of letters (of any alphabet),
        #   apostrophes and hyphens. Digits and any other character split the words.
        def get_words(str, normalized: false)
          return [] unless str

          str = normalize_string(str) unless normalized
          str.scan(/[[:alpha:]'-]+/)
        end

        # Keeps the start order of the `words` and consecutive `words` together/consecutive.
        # @param str [String] the input string with the words.
        # @param range [Integer, Range] determine the length of the generated values.
        # @param normalized [Boolean] `true` when `str` is already normalized.
        # @return [Array<String>] combinations of `range` length of `words`.
        def string_ngrams(str, range = 2..3, normalized: false)
          ngrams(get_words(str, normalized: normalized), range)
        end

        # Keeps the start order of the `words` of the input `Array` `words`.
        # It does **not** keep consecutive `words` together (it can jump/skip items).
        # @param str [String] the input string with the words.
        # @param range [Integer, Range] determine the length of the generated values.
        # @param normalized [Boolean] `true` when `str` is already normalized.
        # @return [Array<String>] combinations of `range` length of `words`
        def string_combinations(str, range = 2..3, normalized: false)
          combinations(get_words(str, normalized: normalized), range)
            .map { |comb| comb.join(' ') }
        end

        # It includes `combinations` that break the initial order of the `Array`.
        # It does **not** keep consecutive `words` together (it can jump/skip items).
        # @param str [String] the input string with the words.
        # @param range [Integer, Range] determine the length of the generated values.
        # @param normalized [Boolean] `true` when `str` is already normalized.
        # @return [Array<String>] permutations of `range` length of `words`
        def string_permutations(str, range = 2..3, normalized: false)
          permutations(get_words(str, normalized: normalized), range)
            .map { |comb| comb.join(' ') }
        end

        # Keeps the start order of the `chars` and consecutive `chars` together/consecutive.
        # @param str [String] the input `word` string.
        # @param range [Integer, Range] determine the length of the generated values.
        # @param normalized [Boolean] `true` when `str` is already normalized.
        # @return [Array<String>] combinations of `range` length of `chars`, with the blanks removed.
        def word_ngrams(str, range = 2..3, normalized: false)
          str = normalize_string(str) unless normalized
          ngrams(str.to_s.chars, range)
            .map { |comb| no_blanks(comb) }
        end

        # @param str [String] the input string.
        # @return [String, nil] `str` without space characters (`nil` if `str` is not a `String`).
        def no_blanks(str)
          return nil unless str.is_a?(String)

          str.delete(' ')
        end
      end
    end
  end
end
|