eco-helpers 2.0.19 → 2.0.21
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +27 -1
- data/eco-helpers.gemspec +5 -1
- data/lib/eco/api/common/loaders/parser.rb +1 -0
- data/lib/eco/api/common/people/entries.rb +1 -0
- data/lib/eco/api/common/people/entry_factory.rb +49 -15
- data/lib/eco/api/common/version_patches/exception.rb +5 -2
- data/lib/eco/api/organization/people.rb +2 -2
- data/lib/eco/api/organization/people_similarity.rb +171 -11
- data/lib/eco/api/organization/tag_tree.rb +33 -0
- data/lib/eco/api/session.rb +4 -2
- data/lib/eco/api/usecases/default_cases.rb +1 -0
- data/lib/eco/api/usecases/default_cases/analyse_people_case.rb +189 -19
- data/lib/eco/api/usecases/default_cases/clean_unknown_tags_case.rb +37 -0
- data/lib/eco/cli/config/default/options.rb +29 -1
- data/lib/eco/cli/config/default/people.rb +18 -24
- data/lib/eco/cli/config/default/usecases.rb +31 -2
- data/lib/eco/cli/config/default/workflow.rb +7 -5
- data/lib/eco/csv/table.rb +121 -21
- data/lib/eco/data/fuzzy_match.rb +52 -12
- data/lib/eco/data/fuzzy_match/chars_position_score.rb +3 -2
- data/lib/eco/data/fuzzy_match/ngrams_score.rb +13 -9
- data/lib/eco/data/fuzzy_match/pairing.rb +12 -18
- data/lib/eco/data/fuzzy_match/result.rb +15 -1
- data/lib/eco/data/fuzzy_match/results.rb +18 -0
- data/lib/eco/data/fuzzy_match/score.rb +12 -7
- data/lib/eco/data/fuzzy_match/string_helpers.rb +14 -1
- data/lib/eco/version.rb +1 -1
- metadata +83 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5525ed41d4f4b42d96eb0d71f02911e7cfb9112890aff984fb872a60d7515976
|
4
|
+
data.tar.gz: d6dacbda91325cea0867c253b99458ae18b16e6450431ead852d686dd6e56859
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1f4a164e153ac8e3d75bfc7024b2ee7c6c101a1703fb79b58a0d8c8fc31fc9978b6353c2a178a419c94ca22e6b49bf440364706642e7b087a4601f4cd7da3b51
|
7
|
+
data.tar.gz: 9fdd7340a79b853b0dd51676e9ea2ba773928c68928b62d7781a525b2c27c1032ec7da04cae1f57afb7b290649350280d0aa5d0f2e7b91ba8a46cfe685bbeadb
|
data/CHANGELOG.md
CHANGED
@@ -1,7 +1,33 @@
|
|
1
1
|
# Change Log
|
2
2
|
All notable changes to this project will be documented in this file.
|
3
3
|
|
4
|
-
## [2.0.
|
4
|
+
## [2.0.21] - 2021-06-0x
|
5
|
+
|
6
|
+
### Added
|
7
|
+
- `Eco::CSV::Table`, support to create the table out of an `Array<Hash>`
|
8
|
+
- This opens new methods to transform input Excel file to this data structure and unify input data structures.
|
9
|
+
- **new** use case `Eco::API::UseCases::DefaultCases::CleanUnknownTags` invokable via `clean-unknown-tags`
|
10
|
+
|
11
|
+
### Changed
|
12
|
+
- `Eco::API::Common::People::EntryFactory` slight **refactor** to boost better support for multiple input formats
|
13
|
+
|
14
|
+
### Fixed
|
15
|
+
|
16
|
+
|
17
|
+
## [2.0.20] - 2021-05-31
|
18
|
+
|
19
|
+
### Added
|
20
|
+
- **dependencies** to `creek`, `roo` and `roo-xls`
|
21
|
+
- **dependencies** to `hashdiff`
|
22
|
+
- `Eco::API::Session#parse_attribute` => added missing parameter `deps:`
|
23
|
+
- new option `-stdout [file]` to redirect the output to a file
|
24
|
+
- `Eco::CSV::Table`, **added** more helper methods `#group_by`, `#transform_values`, `#slice`, `#slice_columns`, `#delete_column`
|
25
|
+
- `Eco::API::Organization::TagTree` **added** more helper methods: `top?`, `tag=`, `as_json`, `dup`, `diff`
|
26
|
+
|
27
|
+
### Fixed
|
28
|
+
- `Exception` patch: when the error is a `SystemStackError` there is no `backtrace` :/
|
29
|
+
|
30
|
+
## [2.0.19] - 2021-05-31
|
5
31
|
|
6
32
|
### Added
|
7
33
|
- Better error message for people searches & **offer** to select among the candidates:
|
data/eco-helpers.gemspec
CHANGED
@@ -14,7 +14,7 @@ Gem::Specification.new do |spec|
|
|
14
14
|
spec.homepage = "https://www.ecoportal.com"
|
15
15
|
spec.licenses = %w[MIT]
|
16
16
|
|
17
|
-
spec.required_ruby_version = '>= 2.
|
17
|
+
spec.required_ruby_version = '>= 2.5.0'
|
18
18
|
|
19
19
|
spec.files = `git ls-files -z`.split("\x0").reject do |f|
|
20
20
|
f.match(%r{^(test|spec|features)/})
|
@@ -36,7 +36,11 @@ Gem::Specification.new do |spec|
|
|
36
36
|
spec.add_dependency 'aws-sdk-ses', '>= 1.36.0', '< 2'
|
37
37
|
spec.add_dependency 'dotenv', '>= 2.7.6', '< 2.8'
|
38
38
|
spec.add_dependency 'net-sftp', '>= 3.0.0', '< 3.1'
|
39
|
+
spec.add_dependency 'hashdiff', '>= 1.0.1', '< 1.1'
|
39
40
|
spec.add_dependency 'fuzzy_match', '>= 2.1.0', '< 2.2'
|
40
41
|
spec.add_dependency 'amatch', '>= 0.4.0', '< 0.5'
|
41
42
|
spec.add_dependency 'jaro_winkler', '>= 1.5.4', '< 1.6'
|
43
|
+
spec.add_dependency 'roo', '>= 2.8.3', '< 2.9'
|
44
|
+
spec.add_dependency 'roo-xls', '>= 1.2.0', '< 1.3'
|
45
|
+
spec.add_dependency 'creek', '>= 2.5.2', '< 2.6'
|
42
46
|
end
|
@@ -2,7 +2,7 @@ module Eco
|
|
2
2
|
module API
|
3
3
|
module Common
|
4
4
|
module People
|
5
|
-
# TODO: EntryFactory should suppport multiple schemas itself
|
5
|
+
# TODO: EntryFactory should support multiple schemas itself (rather than being done on `Session`)
|
6
6
|
# => currently, it's through session.entry_factory(schema: id), but this is wrong
|
7
7
|
# => This way, Entries and PersonEntry will be able to refer to attr_map and person_parser linked to schema_id
|
8
8
|
# => "schema_id" should be an optional column in the input file, or parsable via a custom parser to scope the schema
|
@@ -88,26 +88,49 @@ module Eco
|
|
88
88
|
fatal("Format should be a Symbol. Given '#{format}'") if format && !format.is_a?(Symbol)
|
89
89
|
fatal("There is no parser/serializer for format ':#{format.to_s}'") unless no_format || @person_parser.defined?(format)
|
90
90
|
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
91
|
+
kargs = {}
|
92
|
+
kargs.merge!(content: data) unless no_data
|
93
|
+
kargs.merge!(file: file) unless no_file
|
94
|
+
kargs.merge!(format: format) unless no_format
|
95
|
+
kargs.merge!(encoding: encoding) if encoding
|
96
|
+
|
97
|
+
Entries.new(to_array_of_hashes(**kargs), klass: PersonEntry, factory: self)
|
98
|
+
end
|
99
|
+
|
100
|
+
def to_array_of_hashes(**kargs)
|
101
|
+
data = []
|
102
|
+
content, file, encoding, format = kargs.values_at(:content, :file, :encoding, :format)
|
103
|
+
|
104
|
+
content = get_file_content(file, format, encoding) if file
|
105
|
+
|
106
|
+
case content
|
107
|
+
when !content
|
108
|
+
logger.error("Could not obtain any data out of these: #{kargs}")
|
109
|
+
exit(1)
|
110
|
+
when Hash
|
111
|
+
logger.error("Input data as 'Hash' not supported. Expecting 'Enumerable' or 'String'")
|
112
|
+
exit(1)
|
113
|
+
when String
|
114
|
+
data = person_parser.parse(format, content).map.each_with_index do |entry_hash, i|
|
115
|
+
j = (format == :csv)? i + 2 : i + 1
|
116
|
+
entry_hash.tap {|hash| hash["idx"] = j}
|
117
|
+
end
|
118
|
+
to_array_of_hashes(content: data)
|
119
|
+
when Enumerable
|
120
|
+
sample = content.to_a.first
|
121
|
+
case sample
|
122
|
+
when Hash, Array, ::CSV::Row
|
123
|
+
Eco::CSV::Table.new(content).to_array_of_hashes
|
101
124
|
else
|
102
|
-
logger.
|
125
|
+
logger.error("Input 'Array' of '#{sample.class}' is not supported.")
|
103
126
|
end
|
104
|
-
|
105
|
-
entries(data: arr_hash)
|
106
127
|
else
|
107
|
-
|
128
|
+
logger.error("Could not obtain any data out of content: '#{content.class}'")
|
129
|
+
exit(1)
|
108
130
|
end
|
109
131
|
end
|
110
132
|
|
133
|
+
|
111
134
|
# Helper that generates a file out of `data:`.
|
112
135
|
# @raise Exception
|
113
136
|
# - if you try to provide `data:` in the wrong format.
|
@@ -150,6 +173,17 @@ module Eco
|
|
150
173
|
|
151
174
|
private
|
152
175
|
|
176
|
+
def get_file_content(file, format, encoding)
|
177
|
+
unless Eco::API::Common::Session::FileManager.file_exists?(file)
|
178
|
+
logger.error("File does not exist: #{file}")
|
179
|
+
exit(1)
|
180
|
+
end
|
181
|
+
ext = File.extname(file)
|
182
|
+
encoding ||= Eco::API::Common::Session::FileManager.encoding(file)
|
183
|
+
encoding = (encoding != "utf-8")? "#{encoding}|utf-8": encoding
|
184
|
+
content = File.read(file, encoding: encoding)
|
185
|
+
end
|
186
|
+
|
153
187
|
def fatal(msg)
|
154
188
|
logger.fatal(msg)
|
155
189
|
raise msg
|
@@ -2,8 +2,11 @@ class ::Exception
|
|
2
2
|
def patch_full_message
|
3
3
|
begin
|
4
4
|
msg = []
|
5
|
-
|
6
|
-
|
5
|
+
tracing = backtrace ? backtrace : []
|
6
|
+
tracing = (self.class == SystemStackError) ? tracing[1..30] : tracing[1..-1]
|
7
|
+
tracing ||= []
|
8
|
+
msg << "\n#{tracing.first} \n#{message} (#{self.class.to_s})"
|
9
|
+
tracing.each_with_index {|bt, i| msg << "#{" "*8}#{i+1}: from #{bt}"}
|
7
10
|
msg.join("\n")
|
8
11
|
rescue Exception => e
|
9
12
|
puts "Something is wrong with 'patch_full_message': #{e}"
|
@@ -13,7 +13,19 @@ module Eco
|
|
13
13
|
# @!group Config
|
14
14
|
# @return [String, Proc, nil] the target attribute to be read.
|
15
15
|
def attribute=(attr)
|
16
|
-
@attribute
|
16
|
+
@attribute = attr
|
17
|
+
end
|
18
|
+
|
19
|
+
def attribute
|
20
|
+
@attribute ||= :name
|
21
|
+
end
|
22
|
+
|
23
|
+
# Returns the target value to analyse
|
24
|
+
# @param person [Ecoportal::API::V1::Person]
|
25
|
+
def item_value(person)
|
26
|
+
return attr.call(item) if attribute.is_a?(Proc)
|
27
|
+
attr = attribute.to_sym
|
28
|
+
return item.send(attr) if item.respond_to?(attr)
|
17
29
|
end
|
18
30
|
|
19
31
|
# Define the order of relevance of per user matches
|
@@ -37,6 +49,16 @@ module Eco
|
|
37
49
|
@threshold ||= 0.15
|
38
50
|
end
|
39
51
|
|
52
|
+
# Generates a new object with same config but different base `data`.
|
53
|
+
# @return [Eco::API::Organization::PeopleSimilarity]
|
54
|
+
def newFrom(data)
|
55
|
+
super(data).tap do |simil|
|
56
|
+
simil.threshold = threshold
|
57
|
+
simil.order = order
|
58
|
+
simil.attribute = attribute
|
59
|
+
end
|
60
|
+
end
|
61
|
+
|
40
62
|
# @!endgroup
|
41
63
|
|
42
64
|
# @!group Searchers
|
@@ -50,36 +72,158 @@ module Eco
|
|
50
72
|
end
|
51
73
|
end
|
52
74
|
|
75
|
+
# It returns all people with no name
|
76
|
+
# @return [Eco::API::Organization::PeopleSimilarity]
|
77
|
+
def unnamed
|
78
|
+
select do |person|
|
79
|
+
person.name.to_s.strip.length < 2
|
80
|
+
end.yield_self do |results|
|
81
|
+
newFrom(results)
|
82
|
+
end
|
83
|
+
end
|
84
|
+
|
85
|
+
# It returns all people with a name
|
86
|
+
# @return [Eco::API::Organization::PeopleSimilarity]
|
87
|
+
def named
|
88
|
+
reject do |person|
|
89
|
+
person.name.to_s.strip.length < 2
|
90
|
+
end.yield_self do |results|
|
91
|
+
newFrom(results)
|
92
|
+
end
|
93
|
+
end
|
94
|
+
|
95
|
+
# It returns all the entries with `attribute` empty
|
96
|
+
# @return [Eco::API::Organization::PeopleSimilarity]
|
97
|
+
def blank_attribute
|
98
|
+
select do |person|
|
99
|
+
item_value(person).to_s.strip.length < 2
|
100
|
+
end.yield_self do |results|
|
101
|
+
newFrom(results)
|
102
|
+
end
|
103
|
+
end
|
104
|
+
|
105
|
+
# It returns all the entries with `attribute` **not** empty
|
106
|
+
# @return [Eco::API::Organization::PeopleSimilarity]
|
107
|
+
def attribute_present
|
108
|
+
reject do |person|
|
109
|
+
item_value(person).to_s.strip.length < 2
|
110
|
+
end.yield_self do |results|
|
111
|
+
newFrom(results)
|
112
|
+
end
|
113
|
+
end
|
114
|
+
|
53
115
|
# @!endgroup
|
54
116
|
|
55
|
-
# @!group
|
117
|
+
# @!group Analysis starters
|
56
118
|
|
57
119
|
# Analyses People based on `options`
|
120
|
+
# @param needle_read [Proc, Symbol] when the value to read from `needle` object is different to the `:read` (`attribute`).
|
121
|
+
# This allows, for example, to facet `needle.name` (needle_read) against `haystack_item.details[alt_id]` (read).
|
122
|
+
# @param keep_empty [Boolean] whether to keep people with no results (based on threshold); when `false` they are removed
|
58
123
|
# @return [Hash] where the _keys_ are the people `id`s and the _values_ the `Eco::Data::FuzzyMatch::Results`
|
59
|
-
def analyse(**options)
|
124
|
+
def analyse(needle_read: nil, keep_empty: false, **options)
|
60
125
|
options = { read: self.attribute }.merge(options)
|
126
|
+
total = count; i = 1
|
61
127
|
each_with_object({}) do |person, results|
|
62
|
-
|
128
|
+
needle_str = needle_read ? item_string(person, needle_read) : nil
|
129
|
+
results[person.id] = find_all_with_score(person, needle_str: needle_str, **options)
|
130
|
+
print_progress("Analysed", total, i)
|
131
|
+
i += 1
|
132
|
+
end.yield_self do |analysed|
|
133
|
+
analysed = clean_empty(analysed) unless keep_empty
|
134
|
+
#puts "... #{analysed.count} results after cleaning empty"
|
135
|
+
analysed
|
136
|
+
end
|
137
|
+
end
|
138
|
+
|
139
|
+
# @!endgroup
|
140
|
+
|
141
|
+
# @!group Results Treatment
|
142
|
+
|
143
|
+
# Gets a new instance object of this class, with only people in results
|
144
|
+
# @param analysed [Hash] where the _keys_ are the people `id`s and _values_ the `Eco::Data::FuzzyMatch::Results`
|
145
|
+
# @return [Eco::API::Organization::PeopleSimilarity]
|
146
|
+
def newSimilarity(analysed)
|
147
|
+
newFrom(people_in_results(analysed))
|
148
|
+
end
|
149
|
+
|
150
|
+
def people_in_results(analysed)
|
151
|
+
analysed.each_with_object([]) do |(id, results), people|
|
152
|
+
related = results.each_with_object([self[id]]) do |result, related|
|
153
|
+
related << result.match
|
154
|
+
end
|
155
|
+
related.each {|person| people << person unless people.include?(person)}
|
63
156
|
end
|
64
157
|
end
|
65
158
|
|
159
|
+
# Removes from results those that do not have similar entries
|
160
|
+
def clean_empty(analysed)
|
161
|
+
analysed.select do |id, results|
|
162
|
+
!results.empty?
|
163
|
+
end
|
164
|
+
end
|
165
|
+
|
166
|
+
# Helper to do some treatment of the results
|
167
|
+
# @param analysed [Hash] where the _keys_ are the people `id`s and _values_ the `Eco::Data::FuzzyMatch::Results`
|
168
|
+
# @return [Hash] where the _keys_ are the people `id`s and _values_ the `Eco::Data::FuzzyMatch::Results`
|
169
|
+
def with_analysed(analysed, keep_empty: false)
|
170
|
+
analysed.each_with_object({}) do |(id, results), reanalysed|
|
171
|
+
reanalysed[id] = yield(self[id], results)
|
172
|
+
end.yield_self do |reanalysed|
|
173
|
+
reanalysed = clean_empty(reanalysed) unless keep_empty
|
174
|
+
reanalysed
|
175
|
+
end.tap {|out| "with_analysed... returns #{out.count} records"}
|
176
|
+
end
|
177
|
+
|
66
178
|
# Launches a reanalysis on `analysed` based on `options`
|
67
179
|
# @param analysed [Hash] where the _keys_ are the people `id`s and the _values_ the `Eco::Data::FuzzyMatch::Results`
|
68
|
-
def
|
69
|
-
analysed
|
70
|
-
|
180
|
+
def rearrange(analysed, **options)
|
181
|
+
with_analysed(analysed) do |person, results|
|
182
|
+
results.relevant_results(**options)
|
71
183
|
end
|
72
184
|
end
|
73
185
|
|
74
|
-
#
|
186
|
+
# Reanalyses by using a block to treat the needle and item values
|
187
|
+
def reanalyse(analysed, msg: "Reanalysing", **options, &block)
|
188
|
+
options = { read: self.attribute }.merge(options)
|
189
|
+
total = analysed.count; i = 1
|
190
|
+
with_analysed(analysed) do |person, results|
|
191
|
+
print_progress(msg, total, i)
|
192
|
+
i += 1
|
193
|
+
recalculate_results(results, &block)
|
194
|
+
end
|
195
|
+
end
|
196
|
+
|
197
|
+
# Reanalyses by ignoring matching words between the `needle` and those found in `results`
|
198
|
+
def ignore_matching_words(analysed, **options)
|
199
|
+
prompt = "Reanalysing by ignoring matching words"
|
200
|
+
reanalyse(analysed, msg: prompt, **options) do |needle_str, item_str, needle, item|
|
201
|
+
self.class.remove_matching_words(needle_str, item_str)
|
202
|
+
end
|
203
|
+
end
|
204
|
+
|
205
|
+
# Reanalyses by ignoring matching words between the `needle` and those found in `results`
|
206
|
+
def ignore_matching_words_old(analysed, **options)
|
207
|
+
options = { read: self.attribute }.merge(options)
|
208
|
+
total = analysed.count; i = 1
|
209
|
+
with_analysed(analysed) do |person, results|
|
210
|
+
print_progress("Reanalysing by ignoring matching words", total, i)
|
211
|
+
i += 1
|
212
|
+
ignore_same_words_score(results, **options)
|
213
|
+
end
|
214
|
+
end
|
215
|
+
|
216
|
+
# @!endgroup
|
217
|
+
|
218
|
+
# @!group Reporting Helpers
|
75
219
|
|
76
220
|
# @return [String] well structured text
|
77
|
-
def
|
221
|
+
def report(analysed, format: :txt)
|
78
222
|
case
|
79
223
|
when format == :txt
|
80
224
|
analysed.each_with_object("") do |(id, results), out|
|
81
225
|
msg = results.results.map {|r| r.print}.join("\n ")
|
82
|
-
"
|
226
|
+
out << "#{self[id].identify}:\n " + msg + "\n"
|
83
227
|
end
|
84
228
|
end
|
85
229
|
end
|
@@ -91,7 +235,7 @@ module Eco
|
|
91
235
|
def print_analysis(**options)
|
92
236
|
analysed = options[:analysed] || results_with_false_positives.analyse(**options)
|
93
237
|
analysed.each_with_object({}) do |(id, results), out|
|
94
|
-
puts
|
238
|
+
puts report(analysed)
|
95
239
|
end
|
96
240
|
end
|
97
241
|
# @!endgroup
|
@@ -105,6 +249,22 @@ module Eco
|
|
105
249
|
|
106
250
|
private
|
107
251
|
|
252
|
+
def print_progress(msg, total, num)
|
253
|
+
return unless total > 10
|
254
|
+
puts "" unless num > 1
|
255
|
+
@print_msg_len ||= 0
|
256
|
+
percent = (100 * num.to_f / total).round(1)
|
257
|
+
msg = " #{msg}: #{percent}% (#{num} of #{total})\r"
|
258
|
+
@print_msg_len = msg.length unless @print_msg_len > msg.length
|
259
|
+
print msg
|
260
|
+
$stdout.flush
|
261
|
+
if percent > 99.9
|
262
|
+
sleep(0.2)
|
263
|
+
print "#{" " * @print_msg_len}\r"
|
264
|
+
$stdout.flush
|
265
|
+
end
|
266
|
+
end
|
267
|
+
|
108
268
|
|
109
269
|
end
|
110
270
|
end
|
@@ -42,6 +42,39 @@ module Eco
|
|
42
42
|
init_hashes
|
43
43
|
end
|
44
44
|
|
45
|
+
# Updates the tag of the current tree
|
46
|
+
def tag=(value)
|
47
|
+
@tag = value
|
48
|
+
end
|
49
|
+
|
50
|
+
# @return [Eco::API::Organization::TagTree]
|
51
|
+
def dup
|
52
|
+
self.class.new(as_json)
|
53
|
+
end
|
54
|
+
|
55
|
+
# @return [Array] with the differences
|
56
|
+
def diff(tagtree, differences: {}, level: 0, **options)
|
57
|
+
require 'hashdiff'
|
58
|
+
Hashdiff.diff(self.as_json, tagtree.as_json, **options.slice(:array_path, :similarity, :use_lcs))
|
59
|
+
end
|
60
|
+
|
61
|
+
def top?
|
62
|
+
depth == -1
|
63
|
+
end
|
64
|
+
|
65
|
+
# @return [Array<Hash>, Hash] an `Array` of node hashes for the top tree; otherwise a single `node` `{"tag" => TAG, "nodes" => Array<Hash>}`
|
66
|
+
def as_json
|
67
|
+
nodes_json = nodes.map {|node| node.as_json}
|
68
|
+
if top?
|
69
|
+
nodes_json
|
70
|
+
else
|
71
|
+
{
|
72
|
+
"tag" => tag,
|
73
|
+
"nodes" => nodes_json
|
74
|
+
}
|
75
|
+
end
|
76
|
+
end
|
77
|
+
|
45
78
|
# @return [Boolean] `true` if there are tags in the node, `false` otherwise.
|
46
79
|
def empty?
|
47
80
|
@has_tags.empty?
|