eco-helpers 2.0.19 → 2.0.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 14260868c76936513a93d4d104eacebbd11e47ed05806d4102ee76196a300d2b
4
- data.tar.gz: 35784d03a18f89d2ce8bf5c4105e0eaa647dd10b4e1fee03897319d9ad838760
3
+ metadata.gz: 5525ed41d4f4b42d96eb0d71f02911e7cfb9112890aff984fb872a60d7515976
4
+ data.tar.gz: d6dacbda91325cea0867c253b99458ae18b16e6450431ead852d686dd6e56859
5
5
  SHA512:
6
- metadata.gz: 514d71e93bfa4fb854d9062be03306e154a4dfd184256ab03da30e2bf4bb2a45fb305efd5e2206821e522dc7cc0bfbc69e75285e3c6292dca78f359bd166f52a
7
- data.tar.gz: c99a424905916cef61333c18bb31726e90fb9759a4a1b747b72ab123f4bc09ecc6962205a966b3ea46aa1f0a4cc3dd7f72a8c53829cc764a7993d64ad45495bf
6
+ metadata.gz: 1f4a164e153ac8e3d75bfc7024b2ee7c6c101a1703fb79b58a0d8c8fc31fc9978b6353c2a178a419c94ca22e6b49bf440364706642e7b087a4601f4cd7da3b51
7
+ data.tar.gz: 9fdd7340a79b853b0dd51676e9ea2ba773928c68928b62d7781a525b2c27c1032ec7da04cae1f57afb7b290649350280d0aa5d0f2e7b91ba8a46cfe685bbeadb
data/CHANGELOG.md CHANGED
@@ -1,7 +1,33 @@
1
1
  # Change Log
2
2
  All notable changes to this project will be documented in this file.
3
3
 
4
- ## [2.0.19] - 2021-05-xx
4
+ ## [2.0.21] - 2021-06-0x
5
+
6
+ ### Added
7
+ - `Eco::CSV::Table`, support to create the table out of an `Array<Hash>`
8
+ - This opens new methods to transform an input Excel file into this data structure and unify input data structures.
9
+ - **new** use case `Eco::API::UseCases::DefaultCases::CleanUnknownTags` invokable via `clean-unknown-tags`
10
+
11
+ ### Changed
12
+ - `Eco::API::Common::People::EntryFactory` slight **refactor** to boost better support for multiple input formats
13
+
14
+ ### Fixed
15
+
16
+
17
+ ## [2.0.20] - 2021-05-31
18
+
19
+ ### Added
20
+ - **dependencies** to `creek`, `roo` and `roo-xls`
21
+ - **dependencies** to `hashdiff`
22
+ - `Eco::API::Session#parse_attribute` => added missing parameter `deps:`
23
+ - new option `-stdout [file]` to redirect the output to a file
24
+ - `Eco::CSV::Table`, **added** more helper methods `#group_by`, `#transform_values`, `#slice`, `#slice_columns`, `#delete_column`
25
+ - `Eco::API::Organization::TagTree` **added** more helper methods: `top?`, `tag=`, `as_json`, `dup`, `diff`
26
+
27
+ ### Fixed
28
+ - `Exception` patch: when `SystemStackError` there is not `backtrace` :/
29
+
30
+ ## [2.0.19] - 2021-05-31
5
31
 
6
32
  ### Added
7
33
  - Better error message for people searches & **offer** to select among the candidates:
data/eco-helpers.gemspec CHANGED
@@ -14,7 +14,7 @@ Gem::Specification.new do |spec|
14
14
  spec.homepage = "https://www.ecoportal.com"
15
15
  spec.licenses = %w[MIT]
16
16
 
17
- spec.required_ruby_version = '>= 2.4.4'
17
+ spec.required_ruby_version = '>= 2.5.0'
18
18
 
19
19
  spec.files = `git ls-files -z`.split("\x0").reject do |f|
20
20
  f.match(%r{^(test|spec|features)/})
@@ -36,7 +36,11 @@ Gem::Specification.new do |spec|
36
36
  spec.add_dependency 'aws-sdk-ses', '>= 1.36.0', '< 2'
37
37
  spec.add_dependency 'dotenv', '>= 2.7.6', '< 2.8'
38
38
  spec.add_dependency 'net-sftp', '>= 3.0.0', '< 3.1'
39
+ spec.add_dependency 'hashdiff', '>= 1.0.1', '< 1.1'
39
40
  spec.add_dependency 'fuzzy_match', '>= 2.1.0', '< 2.2'
40
41
  spec.add_dependency 'amatch', '>= 0.4.0', '< 0.5'
41
42
  spec.add_dependency 'jaro_winkler', '>= 1.5.4', '< 1.6'
43
+ spec.add_dependency 'roo', '>= 2.8.3', '< 2.9'
44
+ spec.add_dependency 'roo-xls', '>= 1.2.0', '< 1.3'
45
+ spec.add_dependency 'creek', '>= 2.5.2', '< 2.6'
42
46
  end
@@ -19,6 +19,7 @@ module Eco
19
19
  @attribute = value
20
20
  end
21
21
 
22
 + # TODO: it might rather merge?
22
23
  # Some parsers require dependencies to do their job.
23
24
  def dependencies(value = nil)
24
25
  @dependencies ||= {}
@@ -99,6 +99,7 @@ module Eco
99
99
  newFrom to_a - discarded
100
100
  end
101
101
 
102
 + # TODO: it should rather use the people-to-csv case somehow
102
103
  # Helper to dump the entries into a CSV
103
104
  # @param filename [String] the destination file
104
105
  def export(filename)
@@ -2,7 +2,7 @@ module Eco
2
2
  module API
3
3
  module Common
4
4
  module People
5
- # TODO: EntryFactory should suppport multiple schemas itself
5
 + # TODO: EntryFactory should support multiple schemas itself (rather than being done on `Session`)
6
6
  # => currently, it's through session.entry_factory(schema: id), but this is wrong
7
7
  # => This way, Entries and PersonEntry will be able to refer to attr_map and person_parser linked to schema_id
8
8
  # => "schema_id" should be an optional column in the input file, or parsable via a custom parser to scope the schema
@@ -88,26 +88,49 @@ module Eco
88
88
  fatal("Format should be a Symbol. Given '#{format}'") if format && !format.is_a?(Symbol)
89
89
  fatal("There is no parser/serializer for format ':#{format.to_s}'") unless no_format || @person_parser.defined?(format)
90
90
 
91
- if file
92
- arr_hash = []
93
- if Eco::API::Common::Session::FileManager.file_exists?(file)
94
- encoding ||= Eco::API::Common::Session::FileManager.encoding(file)
95
- encoding = (encoding != "utf-8")? "#{encoding}|utf-8": encoding
96
- file_content = File.read(file, encoding: encoding)
97
- arr_hash = person_parser.parse(format, file_content).map.each_with_index do |entry_hash, i|
98
- j = (format == :csv)? i + 2 : i + 1
99
- entry_hash.tap {|hash| hash["idx"] = j}
100
- end
91
+ kargs = {}
92
+ kargs.merge!(content: data) unless no_data
93
+ kargs.merge!(file: file) unless no_file
94
+ kargs.merge!(format: format) unless no_format
95
+ kargs.merge!(encoding: encoding) if encoding
96
+
97
+ Entries.new(to_array_of_hashes(**kargs), klass: PersonEntry, factory: self)
98
+ end
99
+
100
+ def to_array_of_hashes(**kargs)
101
+ data = []
102
+ content, file, encoding, format = kargs.values_at(:content, :file, :encoding, :format)
103
+
104
+ content = get_file_content(file, format, encoding) if file
105
+
106
+ case content
107
+ when !content
108
+ logger.error("Could not obtain any data out of these: #{kargs}")
109
+ exit(1)
110
+ when Hash
111
+ logger.error("Input data as 'Hash' not supported. Expecting 'Enumerable' or 'String'")
112
+ exit(1)
113
+ when String
114
+ data = person_parser.parse(format, content).map.each_with_index do |entry_hash, i|
115
+ j = (format == :csv)? i + 2 : i + 1
116
+ entry_hash.tap {|hash| hash["idx"] = j}
117
+ end
118
+ to_array_of_hashes(content: data)
119
+ when Enumerable
120
+ sample = content.to_a.first
121
+ case sample
122
+ when Hash, Array, ::CSV::Row
123
+ Eco::CSV::Table.new(content).to_array_of_hashes
101
124
  else
102
- logger.warn("File does not exist: #{file}")
125
+ logger.error("Input 'Array' of '#{sample.class}' is not supported.")
103
126
  end
104
-
105
- entries(data: arr_hash)
106
127
  else
107
- Entries.new(data, klass: PersonEntry, factory: self)
128
+ logger.error("Could not obtain any data out of content: '#{content.class}'")
129
+ exit(1)
108
130
  end
109
131
  end
110
132
 
133
+
111
134
  # Helper that generates a file out of `data:`.
112
135
  # @raise Exception
113
136
  # - if you try to provide `data:` in the wrong format.
@@ -150,6 +173,17 @@ module Eco
150
173
 
151
174
  private
152
175
 
176
+ def get_file_content(file, format, encoding)
177
+ unless Eco::API::Common::Session::FileManager.file_exists?(file)
178
+ logger.error("File does not exist: #{file}")
179
+ exit(1)
180
+ end
181
+ ext = File.extname(file)
182
+ encoding ||= Eco::API::Common::Session::FileManager.encoding(file)
183
+ encoding = (encoding != "utf-8")? "#{encoding}|utf-8": encoding
184
+ content = File.read(file, encoding: encoding)
185
+ end
186
+
153
187
  def fatal(msg)
154
188
  logger.fatal(msg)
155
189
  raise msg
@@ -2,8 +2,11 @@ class ::Exception
2
2
  def patch_full_message
3
3
  begin
4
4
  msg = []
5
- msg << "\n#{backtrace.first} \n#{message} (#{self.class.to_s})"
6
- backtrace[1..-1].each_with_index {|bt, i| msg << "#{" "*8}#{i+1}: from #{bt}"}
5
+ tracing = backtrace ? backtrace : []
6
+ tracing = (self.class == SystemStackError) ? tracing[1..30] : tracing[1..-1]
7
+ tracing ||= []
8
+ msg << "\n#{tracing.first} \n#{message} (#{self.class.to_s})"
9
+ tracing.each_with_index {|bt, i| msg << "#{" "*8}#{i+1}: from #{bt}"}
7
10
  msg.join("\n")
8
11
  rescue Exception => e
9
12
  puts "Something is wrong with 'patch_full_message': #{e}"
@@ -216,8 +216,8 @@ module Eco
216
216
  # @!endgroup
217
217
 
218
218
  # @!group Helper methods
219
- def analytics
220
- Eco::API::Organization::PeopleAnalytics.new(self.to_a)
219
+ def similarity
220
+ Eco::API::Organization::PeopleSimilarity.new(self.to_a)
221
221
  end
222
222
  # @!endgroup
223
223
 
@@ -13,7 +13,19 @@ module Eco
13
13
  # @!group Config
14
14
  # @return [String, Proc, nil] the target attribute to be read.
15
15
  def attribute=(attr)
16
- @attribute ||= "name"
16
+ @attribute = attr
17
+ end
18
+
19
+ def attribute
20
+ @attribute ||= :name
21
+ end
22
+
23
+ # Returns the target value to analyse
24
+ # @param person [Ecoportal::API::V1::Person]
25
+ def item_value(person)
26
+ return attr.call(item) if attribute.is_a?(Proc)
27
+ attr = attribute.to_sym
28
+ return item.send(attr) if item.respond_to?(attr)
17
29
  end
18
30
 
19
31
  # Define the order or relevant of per user matches
@@ -37,6 +49,16 @@ module Eco
37
49
  @threshold ||= 0.15
38
50
  end
39
51
 
52
+ # Generates a new object with same config but different base `data`.
53
+ # @return [Eco::API::Organization::PeopleSimilarity]
54
+ def newFrom(data)
55
+ super(data).tap do |simil|
56
+ simil.threshold = threshold
57
+ simil.order = order
58
+ simil.attribute = attribute
59
+ end
60
+ end
61
+
40
62
  # @!endgroup
41
63
 
42
64
  # @!group Searchers
@@ -50,36 +72,158 @@ module Eco
50
72
  end
51
73
  end
52
74
 
75
+ # It returns all people with no name
76
+ # @return [Eco::API::Organization::PeopleSimilarity]
77
+ def unnamed
78
+ select do |person|
79
+ person.name.to_s.strip.length < 2
80
+ end.yield_self do |results|
81
+ newFrom(results)
82
+ end
83
+ end
84
+
85
 + # It returns all people with a name
86
+ # @return [Eco::API::Organization::PeopleSimilarity]
87
+ def named
88
+ reject do |person|
89
+ person.name.to_s.strip.length < 2
90
+ end.yield_self do |results|
91
+ newFrom(results)
92
+ end
93
+ end
94
+
95
+ # It returns all the entries with `attribute` empty
96
+ # @return [Eco::API::Organization::PeopleSimilarity]
97
+ def blank_attribute
98
+ select do |person|
99
+ item_value(person).to_s.strip.length < 2
100
+ end.yield_self do |results|
101
+ newFrom(results)
102
+ end
103
+ end
104
+
105
 + # It returns all the entries with `attribute` **not** empty
106
+ # @return [Eco::API::Organization::PeopleSimilarity]
107
+ def attribute_present
108
+ reject do |person|
109
+ item_value(person).to_s.strip.length < 2
110
+ end.yield_self do |results|
111
+ newFrom(results)
112
+ end
113
+ end
114
+
53
115
  # @!endgroup
54
116
 
55
- # @!group Analysers
117
 + # @!group Analysis starters
56
118
 
57
119
  # Analyses People bases on `options`
120
+ # @param needle_read [Proc, Symbol] when the value to read from `needle` object is different to the `:read` (`attribute`).
121
+ # This allows to for example, facet `needle.name` (needle_read) against `haystack_item.details[alt_id]` (read).
122
+ # @param keep_empty [Boolean] to indicate if it should get rid of people with no results (based on threshold)
58
123
  # @return [Hash] where the _keys_ are the people `id`s and the _values_ the `Eco::Data::FuzzyMatch::Results`
59
- def analyse(**options)
124
+ def analyse(needle_read: nil, keep_empty: false, **options)
60
125
  options = { read: self.attribute }.merge(options)
126
+ total = count; i = 1
61
127
  each_with_object({}) do |person, results|
62
- results[person.id] = find_all_with_score(person, **options)
128
+ needle_str = needle_read ? item_string(person, needle_read) : nil
129
+ results[person.id] = find_all_with_score(person, needle_str: needle_str, **options)
130
+ print_progress("Analysed", total, i)
131
+ i += 1
132
+ end.yield_self do |analysed|
133
+ analysed = clean_empty(analysed) unless keep_empty
134
+ #puts "... #{analysed.count} results after cleaning empty"
135
+ analysed
136
+ end
137
+ end
138
+
139
+ # @!endgroup
140
+
141
+ # @!group Results Treatment
142
+
143
+ # Gets a new instance object of this class, with only people in results
144
+ # @param analysed [Hash] where the _keys_ are the people `id`s and _values_ the `Eco::Data::FuzzyMatch::Results`
145
+ # @return [Eco::API::Organization::PeopleSimilarity]
146
+ def newSimilarity(analysed)
147
+ newFrom(people_in_results(analysed))
148
+ end
149
+
150
+ def people_in_results(analysed)
151
+ analysed.each_with_object([]) do |(id, results), people|
152
+ related = results.each_with_object([self[id]]) do |result, related|
153
+ related << result.match
154
+ end
155
+ related.each {|person| people << person unless people.include?(person)}
63
156
  end
64
157
  end
65
158
 
159
+ # Removes from results those that do not have similar entries
160
+ def clean_empty(analysed)
161
+ analysed.select do |id, results|
162
+ !results.empty?
163
+ end
164
+ end
165
+
166
 + # Helper to do some treatment of the results
167
+ # @param analysed [Hash] where the _keys_ are the people `id`s and _values_ the `Eco::Data::FuzzyMatch::Results`
168
+ # @return [Hash] where the _keys_ are the people `id`s and _values_ the `Eco::Data::FuzzyMatch::Results`
169
+ def with_analysed(analysed, keep_empty: false)
170
+ analysed.each_with_object({}) do |(id, results), reanalysed|
171
+ reanalysed[id] = yield(self[id], results)
172
+ end.yield_self do |reanalysed|
173
+ reanalysed = clean_empty(reanalysed) unless keep_empty
174
+ reanalysed
175
+ end.tap {|out| "with_analysed... returns #{out.count} records"}
176
+ end
177
+
66
178
  # Launches a reanalysis on `analysed` based on `options`
67
179
  # @param analysed [Hash] where the _keys_ are the people `id`s and the _values_ the `Eco::Data::FuzzyMatch::Results`
68
- def re_analyse(analysed, **options)
69
- analysed.each_with_object({}) do |(id, results), out|
70
- out[id] = results.relevant_results(**options)
180
+ def rearrange(analysed, **options)
181
+ with_analysed(analysed) do |person, results|
182
+ results.relevant_results(**options)
71
183
  end
72
184
  end
73
185
 
74
- # @!group Helpers
186
+ # Reanalyses by using a block to treat the needle and item values
187
+ def reanalyse(analysed, msg: "Reanalysing", **options, &block)
188
+ options = { read: self.attribute }.merge(options)
189
+ total = analysed.count; i = 1
190
+ with_analysed(analysed) do |person, results|
191
+ print_progress(msg, total, i)
192
+ i += 1
193
+ recalculate_results(results, &block)
194
+ end
195
+ end
196
+
197
 + # Reanalyses by ignoring matching words between the `needle` and those found in `results`
198
+ def ignore_matching_words(analysed, **options)
199
+ prompt = "Reanalysing by ignoring matching words"
200
+ reanalyse(analysed, msg: prompt, **options) do |needle_str, item_str, needle, item|
201
+ self.class.remove_matching_words(needle_str, item_str)
202
+ end
203
+ end
204
+
205
 + # Reanalyses by ignoring matching words between the `needle` and those found in `results`
206
+ def ignore_matching_words_old(analysed, **options)
207
+ options = { read: self.attribute }.merge(options)
208
+ total = analysed.count; i = 1
209
+ with_analysed(analysed) do |person, results|
210
+ print_progress("Reanalysing by ignoring matching words", total, i)
211
+ i += 1
212
+ ignore_same_words_score(results, **options)
213
+ end
214
+ end
215
+
216
+ # @!endgroup
217
+
218
+ # @!group Reporting Helpers
75
219
 
76
220
  # @return [String] well structured text
77
- def analysis(analysed, format: :txt)
221
+ def report(analysed, format: :txt)
78
222
  case
79
223
  when format == :txt
80
224
  analysed.each_with_object("") do |(id, results), out|
81
225
  msg = results.results.map {|r| r.print}.join("\n ")
82
- "'#{self[id].identify}':\n " + msg
226
+ out << "#{self[id].identify}:\n " + msg + "\n"
83
227
  end
84
228
  end
85
229
  end
@@ -91,7 +235,7 @@ module Eco
91
235
  def print_analysis(**options)
92
236
  analysed = options[:analysed] || results_with_false_positives.analyse(**options)
93
237
  analysed.each_with_object({}) do |(id, results), out|
94
- puts analysis(analysed)
238
+ puts report(analysed)
95
239
  end
96
240
  end
97
241
  # @!endgroup
@@ -105,6 +249,22 @@ module Eco
105
249
 
106
250
  private
107
251
 
252
+ def print_progress(msg, total, num)
253
+ return unless total > 10
254
+ puts "" unless num > 1
255
+ @print_msg_len ||= 0
256
+ percent = (100 * num.to_f / total).round(1)
257
+ msg = " #{msg}: #{percent}% (#{num} of #{total})\r"
258
+ @print_msg_len = msg.length unless @print_msg_len > msg.length
259
+ print msg
260
+ $stdout.flush
261
+ if percent > 99.9
262
+ sleep(0.2)
263
+ print "#{" " * @print_msg_len}\r"
264
+ $stdout.flush
265
+ end
266
+ end
267
+
108
268
 
109
269
  end
110
270
  end
@@ -42,6 +42,39 @@ module Eco
42
42
  init_hashes
43
43
  end
44
44
 
45
+ # Updates the tag of the current tree
46
+ def tag=(value)
47
+ @tag = value
48
+ end
49
+
50
+ # @return [Eco::API::Organization::TagTree]
51
+ def dup
52
+ self.class.new(as_json)
53
+ end
54
+
55
+ # @return [Array] with the differences
56
+ def diff(tagtree, differences: {}, level: 0, **options)
57
+ require 'hashdiff'
58
+ Hashdiff.diff(self.as_json, tagtree.as_json, **options.slice(:array_path, :similarity, :use_lcs))
59
+ end
60
+
61
+ def top?
62
+ depth == -1
63
+ end
64
+
65
+ # @return [Array[Hash]] where `Hash` is a `node` `{"tag" => TAG, "nodes": Array[Hash]}`
66
+ def as_json
67
+ nodes_json = nodes.map {|node| node.as_json}
68
+ if top?
69
+ nodes_json
70
+ else
71
+ {
72
+ "tag" => tag,
73
+ "nodes" => nodes_json
74
+ }
75
+ end
76
+ end
77
+
45
78
  # @return [Boolean] `true` if there are tags in the node, `false` otherwise.
46
79
  def empty?
47
80
  @has_tags.empty?