eco-helpers 2.0.19 → 2.0.21

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 14260868c76936513a93d4d104eacebbd11e47ed05806d4102ee76196a300d2b
4
- data.tar.gz: 35784d03a18f89d2ce8bf5c4105e0eaa647dd10b4e1fee03897319d9ad838760
3
+ metadata.gz: 5525ed41d4f4b42d96eb0d71f02911e7cfb9112890aff984fb872a60d7515976
4
+ data.tar.gz: d6dacbda91325cea0867c253b99458ae18b16e6450431ead852d686dd6e56859
5
5
  SHA512:
6
- metadata.gz: 514d71e93bfa4fb854d9062be03306e154a4dfd184256ab03da30e2bf4bb2a45fb305efd5e2206821e522dc7cc0bfbc69e75285e3c6292dca78f359bd166f52a
7
- data.tar.gz: c99a424905916cef61333c18bb31726e90fb9759a4a1b747b72ab123f4bc09ecc6962205a966b3ea46aa1f0a4cc3dd7f72a8c53829cc764a7993d64ad45495bf
6
+ metadata.gz: 1f4a164e153ac8e3d75bfc7024b2ee7c6c101a1703fb79b58a0d8c8fc31fc9978b6353c2a178a419c94ca22e6b49bf440364706642e7b087a4601f4cd7da3b51
7
+ data.tar.gz: 9fdd7340a79b853b0dd51676e9ea2ba773928c68928b62d7781a525b2c27c1032ec7da04cae1f57afb7b290649350280d0aa5d0f2e7b91ba8a46cfe685bbeadb
data/CHANGELOG.md CHANGED
@@ -1,7 +1,33 @@
1
1
  # Change Log
2
2
  All notable changes to this project will be documented in this file.
3
3
 
4
- ## [2.0.19] - 2021-05-xx
4
+ ## [2.0.21] - 2021-06-0x
5
+
6
+ ### Added
7
+ - `Eco::CSV::Table`, support to create the table out of an `Array<Hash>`
8
+ - This opens new methods to transform input Excel file to this data structure and unify input data structures.
9
+ - **new** use case `Eco::API::UseCases::DefaultCases::CleanUnknownTags` invokable via `clean-unknown-tags`
10
+
11
+ ### Changed
12
+ - `Eco::API::Common::People::EntryFactory` slight **refactor** to boost better support for multiple input formats
13
+
14
+ ### Fixed
15
+
16
+
17
+ ## [2.0.20] - 2021-05-31
18
+
19
+ ### Added
20
+ - **dependencies** to `creek`, `roo` and `roo-xls`
21
+ - **dependencies** to `hashdiff`
22
+ - `Eco::API::Session#parse_attribute` => added missing parameter `deps:`
23
+ - new option `-stdout [file]` to redirect the output to a file
24
+ - `Eco::CSV::Table`, **added** more helper methods `#group_by`, `#transform_values`, `#slice`, `#slice_columns`, `#delete_column`
25
+ - `Eco::API::Organization::TagTree` **added** more helper methods: `top?`, `tag=`, `as_json`, `dup`, `diff`
26
+
27
+ ### Fixed
28
+ - `Exception` patch: on `SystemStackError` there is no `backtrace` :/
29
+
30
+ ## [2.0.19] - 2021-05-31
5
31
 
6
32
  ### Added
7
33
  - Better error message for people searches & **offer** to select among the candidates:
data/eco-helpers.gemspec CHANGED
@@ -14,7 +14,7 @@ Gem::Specification.new do |spec|
14
14
  spec.homepage = "https://www.ecoportal.com"
15
15
  spec.licenses = %w[MIT]
16
16
 
17
- spec.required_ruby_version = '>= 2.4.4'
17
+ spec.required_ruby_version = '>= 2.5.0'
18
18
 
19
19
  spec.files = `git ls-files -z`.split("\x0").reject do |f|
20
20
  f.match(%r{^(test|spec|features)/})
@@ -36,7 +36,11 @@ Gem::Specification.new do |spec|
36
36
  spec.add_dependency 'aws-sdk-ses', '>= 1.36.0', '< 2'
37
37
  spec.add_dependency 'dotenv', '>= 2.7.6', '< 2.8'
38
38
  spec.add_dependency 'net-sftp', '>= 3.0.0', '< 3.1'
39
+ spec.add_dependency 'hashdiff', '>= 1.0.1', '< 1.1'
39
40
  spec.add_dependency 'fuzzy_match', '>= 2.1.0', '< 2.2'
40
41
  spec.add_dependency 'amatch', '>= 0.4.0', '< 0.5'
41
42
  spec.add_dependency 'jaro_winkler', '>= 1.5.4', '< 1.6'
43
+ spec.add_dependency 'roo', '>= 2.8.3', '< 2.9'
44
+ spec.add_dependency 'roo-xls', '>= 1.2.0', '< 1.3'
45
+ spec.add_dependency 'creek', '>= 2.5.2', '< 2.6'
42
46
  end
@@ -19,6 +19,7 @@ module Eco
19
19
  @attribute = value
20
20
  end
21
21
 
22
+ # TODO: it might rather merge?
22
23
  # Some parsers require dependencies to do their job.
23
24
  def dependencies(value = nil)
24
25
  @dependencies ||= {}
@@ -99,6 +99,7 @@ module Eco
99
99
  newFrom to_a - discarded
100
100
  end
101
101
 
102
+ # TODO: it should rather use the people-to-csv case somehow
102
103
  # Helper to dump the entries into a CSV
103
104
  # @param filename [String] the destination file
104
105
  def export(filename)
@@ -2,7 +2,7 @@ module Eco
2
2
  module API
3
3
  module Common
4
4
  module People
5
- # TODO: EntryFactory should suppport multiple schemas itself
5
+ # TODO: EntryFactory should support multiple schemas itself (rather than being done on `Session`)
6
6
  # => currently, it's through session.entry_factory(schema: id), but this is wrong
7
7
  # => This way, Entries and PersonEntry will be able to refer to attr_map and person_parser linked to schema_id
8
8
  # => "schema_id" should be an optional column in the input file, or parsable via a custom parser to scope the schema
@@ -88,26 +88,49 @@ module Eco
88
88
  fatal("Format should be a Symbol. Given '#{format}'") if format && !format.is_a?(Symbol)
89
89
  fatal("There is no parser/serializer for format ':#{format.to_s}'") unless no_format || @person_parser.defined?(format)
90
90
 
91
- if file
92
- arr_hash = []
93
- if Eco::API::Common::Session::FileManager.file_exists?(file)
94
- encoding ||= Eco::API::Common::Session::FileManager.encoding(file)
95
- encoding = (encoding != "utf-8")? "#{encoding}|utf-8": encoding
96
- file_content = File.read(file, encoding: encoding)
97
- arr_hash = person_parser.parse(format, file_content).map.each_with_index do |entry_hash, i|
98
- j = (format == :csv)? i + 2 : i + 1
99
- entry_hash.tap {|hash| hash["idx"] = j}
100
- end
91
+ kargs = {}
92
+ kargs.merge!(content: data) unless no_data
93
+ kargs.merge!(file: file) unless no_file
94
+ kargs.merge!(format: format) unless no_format
95
+ kargs.merge!(encoding: encoding) if encoding
96
+
97
+ Entries.new(to_array_of_hashes(**kargs), klass: PersonEntry, factory: self)
98
+ end
99
+
100
+ def to_array_of_hashes(**kargs)
101
+ data = []
102
+ content, file, encoding, format = kargs.values_at(:content, :file, :encoding, :format)
103
+
104
+ content = get_file_content(file, format, encoding) if file
105
+
106
+ case content
107
+ when !content
108
+ logger.error("Could not obtain any data out of these: #{kargs}")
109
+ exit(1)
110
+ when Hash
111
+ logger.error("Input data as 'Hash' not supported. Expecting 'Enumerable' or 'String'")
112
+ exit(1)
113
+ when String
114
+ data = person_parser.parse(format, content).map.each_with_index do |entry_hash, i|
115
+ j = (format == :csv)? i + 2 : i + 1
116
+ entry_hash.tap {|hash| hash["idx"] = j}
117
+ end
118
+ to_array_of_hashes(content: data)
119
+ when Enumerable
120
+ sample = content.to_a.first
121
+ case sample
122
+ when Hash, Array, ::CSV::Row
123
+ Eco::CSV::Table.new(content).to_array_of_hashes
101
124
  else
102
- logger.warn("File does not exist: #{file}")
125
+ logger.error("Input 'Array' of '#{sample.class}' is not supported.")
103
126
  end
104
-
105
- entries(data: arr_hash)
106
127
  else
107
- Entries.new(data, klass: PersonEntry, factory: self)
128
+ logger.error("Could not obtain any data out of content: '#{content.class}'")
129
+ exit(1)
108
130
  end
109
131
  end
110
132
 
133
+
111
134
  # Helper that generates a file out of `data:`.
112
135
  # @raise Exception
113
136
  # - if you try to provide `data:` in the wrong format.
@@ -150,6 +173,17 @@ module Eco
150
173
 
151
174
  private
152
175
 
176
+ def get_file_content(file, format, encoding)
177
+ unless Eco::API::Common::Session::FileManager.file_exists?(file)
178
+ logger.error("File does not exist: #{file}")
179
+ exit(1)
180
+ end
181
+ ext = File.extname(file)
182
+ encoding ||= Eco::API::Common::Session::FileManager.encoding(file)
183
+ encoding = (encoding != "utf-8")? "#{encoding}|utf-8": encoding
184
+ content = File.read(file, encoding: encoding)
185
+ end
186
+
153
187
  def fatal(msg)
154
188
  logger.fatal(msg)
155
189
  raise msg
@@ -2,8 +2,11 @@ class ::Exception
2
2
  def patch_full_message
3
3
  begin
4
4
  msg = []
5
- msg << "\n#{backtrace.first} \n#{message} (#{self.class.to_s})"
6
- backtrace[1..-1].each_with_index {|bt, i| msg << "#{" "*8}#{i+1}: from #{bt}"}
5
+ tracing = backtrace ? backtrace : []
6
+ tracing = (self.class == SystemStackError) ? tracing[1..30] : tracing[1..-1]
7
+ tracing ||= []
8
+ msg << "\n#{tracing.first} \n#{message} (#{self.class.to_s})"
9
+ tracing.each_with_index {|bt, i| msg << "#{" "*8}#{i+1}: from #{bt}"}
7
10
  msg.join("\n")
8
11
  rescue Exception => e
9
12
  puts "Something is wrong with 'patch_full_message': #{e}"
@@ -216,8 +216,8 @@ module Eco
216
216
  # @!endgroup
217
217
 
218
218
  # @!group Helper methods
219
- def analytics
220
- Eco::API::Organization::PeopleAnalytics.new(self.to_a)
219
+ def similarity
220
+ Eco::API::Organization::PeopleSimilarity.new(self.to_a)
221
221
  end
222
222
  # @!endgroup
223
223
 
@@ -13,7 +13,19 @@ module Eco
13
13
  # @!group Config
14
14
  # @return [String, Proc, nil] the target attribute to be read.
15
15
  def attribute=(attr)
16
- @attribute ||= "name"
16
+ @attribute = attr
17
+ end
18
+
19
+ def attribute
20
+ @attribute ||= :name
21
+ end
22
+
23
+ # Returns the target value to analyse
24
+ # @param person [Ecoportal::API::V1::Person]
25
+ def item_value(person)
26
+ return attr.call(item) if attribute.is_a?(Proc)
27
+ attr = attribute.to_sym
28
+ return item.send(attr) if item.respond_to?(attr)
17
29
  end
18
30
 
19
31
  # Define the order or relevant of per user matches
@@ -37,6 +49,16 @@ module Eco
37
49
  @threshold ||= 0.15
38
50
  end
39
51
 
52
+ # Generates a new object with same config but different base `data`.
53
+ # @return [Eco::API::Organization::PeopleSimilarity]
54
+ def newFrom(data)
55
+ super(data).tap do |simil|
56
+ simil.threshold = threshold
57
+ simil.order = order
58
+ simil.attribute = attribute
59
+ end
60
+ end
61
+
40
62
  # @!endgroup
41
63
 
42
64
  # @!group Searchers
@@ -50,36 +72,158 @@ module Eco
50
72
  end
51
73
  end
52
74
 
75
+ # It returns all people with no name
76
+ # @return [Eco::API::Organization::PeopleSimilarity]
77
+ def unnamed
78
+ select do |person|
79
+ person.name.to_s.strip.length < 2
80
+ end.yield_self do |results|
81
+ newFrom(results)
82
+ end
83
+ end
84
+
85
+ # It returns all people with a name
86
+ # @return [Eco::API::Organization::PeopleSimilarity]
87
+ def named
88
+ reject do |person|
89
+ person.name.to_s.strip.length < 2
90
+ end.yield_self do |results|
91
+ newFrom(results)
92
+ end
93
+ end
94
+
95
+ # It returns all the entries with `attribute` empty
96
+ # @return [Eco::API::Organization::PeopleSimilarity]
97
+ def blank_attribute
98
+ select do |person|
99
+ item_value(person).to_s.strip.length < 2
100
+ end.yield_self do |results|
101
+ newFrom(results)
102
+ end
103
+ end
104
+
105
+ # It returns all the entries with `attribute` **not** empty
106
+ # @return [Eco::API::Organization::PeopleSimilarity]
107
+ def attribute_present
108
+ reject do |person|
109
+ item_value(person).to_s.strip.length < 2
110
+ end.yield_self do |results|
111
+ newFrom(results)
112
+ end
113
+ end
114
+
53
115
  # @!endgroup
54
116
 
55
- # @!group Analysers
117
+ # @!group Analysis starters
56
118
 
57
119
  # Analyses People based on `options`
120
+ # @param needle_read [Proc, Symbol] when the value to read from `needle` object is different to the `:read` (`attribute`).
121
+ # This allows, for example, faceting `needle.name` (needle_read) against `haystack_item.details[alt_id]` (read).
122
+ # @param keep_empty [Boolean] to indicate if it should get rid of people with no results (based on threshold)
58
123
  # @return [Hash] where the _keys_ are the people `id`s and the _values_ the `Eco::Data::FuzzyMatch::Results`
59
- def analyse(**options)
124
+ def analyse(needle_read: nil, keep_empty: false, **options)
60
125
  options = { read: self.attribute }.merge(options)
126
+ total = count; i = 1
61
127
  each_with_object({}) do |person, results|
62
- results[person.id] = find_all_with_score(person, **options)
128
+ needle_str = needle_read ? item_string(person, needle_read) : nil
129
+ results[person.id] = find_all_with_score(person, needle_str: needle_str, **options)
130
+ print_progress("Analysed", total, i)
131
+ i += 1
132
+ end.yield_self do |analysed|
133
+ analysed = clean_empty(analysed) unless keep_empty
134
+ #puts "... #{analysed.count} results after cleaning empty"
135
+ analysed
136
+ end
137
+ end
138
+
139
+ # @!endgroup
140
+
141
+ # @!group Results Treatment
142
+
143
+ # Gets a new instance object of this class, with only people in results
144
+ # @param analysed [Hash] where the _keys_ are the people `id`s and _values_ the `Eco::Data::FuzzyMatch::Results`
145
+ # @return [Eco::API::Organization::PeopleSimilarity]
146
+ def newSimilarity(analysed)
147
+ newFrom(people_in_results(analysed))
148
+ end
149
+
150
+ def people_in_results(analysed)
151
+ analysed.each_with_object([]) do |(id, results), people|
152
+ related = results.each_with_object([self[id]]) do |result, related|
153
+ related << result.match
154
+ end
155
+ related.each {|person| people << person unless people.include?(person)}
63
156
  end
64
157
  end
65
158
 
159
+ # Removes from results those that do not have similar entries
160
+ def clean_empty(analysed)
161
+ analysed.select do |id, results|
162
+ !results.empty?
163
+ end
164
+ end
165
+
166
+ # Helper to do some treatment of the results
167
+ # @param analysed [Hash] where the _keys_ are the people `id`s and _values_ the `Eco::Data::FuzzyMatch::Results`
168
+ # @return [Hash] where the _keys_ are the people `id`s and _values_ the `Eco::Data::FuzzyMatch::Results`
169
+ def with_analysed(analysed, keep_empty: false)
170
+ analysed.each_with_object({}) do |(id, results), reanalysed|
171
+ reanalysed[id] = yield(self[id], results)
172
+ end.yield_self do |reanalysed|
173
+ reanalysed = clean_empty(reanalysed) unless keep_empty
174
+ reanalysed
175
+ end.tap {|out| "with_analysed... returns #{out.count} records"}
176
+ end
177
+
66
178
  # Launches a reanalysis on `analysed` based on `options`
67
179
  # @param analysed [Hash] where the _keys_ are the people `id`s and the _values_ the `Eco::Data::FuzzyMatch::Results`
68
- def re_analyse(analysed, **options)
69
- analysed.each_with_object({}) do |(id, results), out|
70
- out[id] = results.relevant_results(**options)
180
+ def rearrange(analysed, **options)
181
+ with_analysed(analysed) do |person, results|
182
+ results.relevant_results(**options)
71
183
  end
72
184
  end
73
185
 
74
- # @!group Helpers
186
+ # Reanalyses by using a block to treat the needle and item values
187
+ def reanalyse(analysed, msg: "Reanalysing", **options, &block)
188
+ options = { read: self.attribute }.merge(options)
189
+ total = analysed.count; i = 1
190
+ with_analysed(analysed) do |person, results|
191
+ print_progress(msg, total, i)
192
+ i += 1
193
+ recalculate_results(results, &block)
194
+ end
195
+ end
196
+
197
+ # Reanalyses by ignoring matching words between the `needle` and those found in `results`
198
+ def ignore_matching_words(analysed, **options)
199
+ prompt = "Reanalysing by ignoring matching words"
200
+ reanalyse(analysed, msg: prompt, **options) do |needle_str, item_str, needle, item|
201
+ self.class.remove_matching_words(needle_str, item_str)
202
+ end
203
+ end
204
+
205
+ # Reanalyses by ignoring matching words between the `needle` and those found in `results`
206
+ def ignore_matching_words_old(analysed, **options)
207
+ options = { read: self.attribute }.merge(options)
208
+ total = analysed.count; i = 1
209
+ with_analysed(analysed) do |person, results|
210
+ print_progress("Reanalysing by ignoring matching words", total, i)
211
+ i += 1
212
+ ignore_same_words_score(results, **options)
213
+ end
214
+ end
215
+
216
+ # @!endgroup
217
+
218
+ # @!group Reporting Helpers
75
219
 
76
220
  # @return [String] well structured text
77
- def analysis(analysed, format: :txt)
221
+ def report(analysed, format: :txt)
78
222
  case
79
223
  when format == :txt
80
224
  analysed.each_with_object("") do |(id, results), out|
81
225
  msg = results.results.map {|r| r.print}.join("\n ")
82
- "'#{self[id].identify}':\n " + msg
226
+ out << "#{self[id].identify}:\n " + msg + "\n"
83
227
  end
84
228
  end
85
229
  end
@@ -91,7 +235,7 @@ module Eco
91
235
  def print_analysis(**options)
92
236
  analysed = options[:analysed] || results_with_false_positives.analyse(**options)
93
237
  analysed.each_with_object({}) do |(id, results), out|
94
- puts analysis(analysed)
238
+ puts report(analysed)
95
239
  end
96
240
  end
97
241
  # @!endgroup
@@ -105,6 +249,22 @@ module Eco
105
249
 
106
250
  private
107
251
 
252
+ def print_progress(msg, total, num)
253
+ return unless total > 10
254
+ puts "" unless num > 1
255
+ @print_msg_len ||= 0
256
+ percent = (100 * num.to_f / total).round(1)
257
+ msg = " #{msg}: #{percent}% (#{num} of #{total})\r"
258
+ @print_msg_len = msg.length unless @print_msg_len > msg.length
259
+ print msg
260
+ $stdout.flush
261
+ if percent > 99.9
262
+ sleep(0.2)
263
+ print "#{" " * @print_msg_len}\r"
264
+ $stdout.flush
265
+ end
266
+ end
267
+
108
268
 
109
269
  end
110
270
  end
@@ -42,6 +42,39 @@ module Eco
42
42
  init_hashes
43
43
  end
44
44
 
45
+ # Updates the tag of the current tree
46
+ def tag=(value)
47
+ @tag = value
48
+ end
49
+
50
+ # @return [Eco::API::Organization::TagTree]
51
+ def dup
52
+ self.class.new(as_json)
53
+ end
54
+
55
+ # @return [Array] with the differences
56
+ def diff(tagtree, differences: {}, level: 0, **options)
57
+ require 'hashdiff'
58
+ Hashdiff.diff(self.as_json, tagtree.as_json, **options.slice(:array_path, :similarity, :use_lcs))
59
+ end
60
+
61
+ def top?
62
+ depth == -1
63
+ end
64
+
65
+ # @return [Array[Hash]] where `Hash` is a `node` `{"tag" => TAG, "nodes": Array[Hash]}`
66
+ def as_json
67
+ nodes_json = nodes.map {|node| node.as_json}
68
+ if top?
69
+ nodes_json
70
+ else
71
+ {
72
+ "tag" => tag,
73
+ "nodes" => nodes_json
74
+ }
75
+ end
76
+ end
77
+
45
78
  # @return [Boolean] `true` if there are tags in the node, `false` otherwise.
46
79
  def empty?
47
80
  @has_tags.empty?