data_collector 0.4.0 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 5fb3cc5c42f536a8bdc99421eb6f46972c6c97597f5c0af63e96463f320837ef
4
- data.tar.gz: d377133bf042985f6d68b2bb2960d9c4ddae169a39d4602df4a3f4003d6b6434
3
+ metadata.gz: bfc42cf6807d68d0bf356494918fb3309ad19de5b037e9fac5f374c82a267ef6
4
+ data.tar.gz: a13d553f5003943755cc3e14dbdfd3fbf22a6ee00625ace045e468dba7ebf160
5
5
  SHA512:
6
- metadata.gz: cd48af243e1615e8cafaf8ed1ab4d9019661e3205286fbd32c9ec4bb3e2faa7e07025a62b378062f10dfc0f1775acb401ad4cb901851cbbaf4cb78a2e7c6474a
7
- data.tar.gz: c92eee26040f89ab0b691ba6bf4ad9792222e545e82bb9ebea8b4a4941831615a74d9f389e58d6feccf89ba42c6dd5a02c55bfed996fc670cd1d9600d5023476
6
+ metadata.gz: 6ec4eab7ef70145e63a84c814683e5faf026ab37d51e687f24b456a91d66e1bbb22955ec2de70f59eb6c877f1c133c1b62344dc7382abaa859f36c85d82a0fa3
7
+ data.tar.gz: d1f88a5d04f68c699837032f2f74fd30ebfcad7a492dc0009c211e9462a6c8f711141809dc1f01ad1b38de604353a490008d0470b767278dcd088ef2d6f37ef9
data/README.md CHANGED
@@ -1,10 +1,15 @@
1
1
  # DataCollector
2
- Convenience module to Extract, Transform and Load your data.
3
- You have 3 main objects that you can use for ETL => INPUT, OUTPUT and FILTER
4
- and support objects like CONFIG, LOG, RULES and the new RULES_NG
2
+ Convenience module to Extract, Transform and Load your data.
3
+ You have main objects that help you to 'INPUT', 'OUTPUT' and 'FILTER' data: the basic ETL components.
4
+ Support objects like CONFIG, LOG, RULES and the new RULES_NG are there just to make life easier.
5
5
 
6
6
  Including the DataCollector::Core module into your application gives you access to these objects.
7
7
 
8
+ The RULES and RULES_NG objects are built on a very simple concept. A rule consists of 3 components:
9
+ - a destination tag
10
+ - a jsonpath filter to get the data
11
+ - a lambda to execute on every filter hit
12
+
8
13
 
9
14
  #### input
10
15
  Read input from a URI. This URI can have an http, https or file scheme
@@ -92,7 +97,7 @@ filter data from a hash using [JSONPath](http://goessner.net/articles/JsonPath/i
92
97
  filtered_data = filter(data, "$..metadata.record")
93
98
  ```
94
99
 
95
- #### rules
100
+ #### rules (deprecated)
96
101
  See newer rules_ng object
97
102
  Allows you to define a simple lambda structure to run against a JSONPath filter
98
103
 
@@ -138,6 +143,90 @@ RULE_SET
138
143
  SUFFIX
139
144
  ```
140
145
 
146
+ ##### Examples
147
+
148
+ Here you find the different rule combinations that are possible.
149
+
150
+ ``` ruby
151
+ RULE_SETS = {
152
+ 'rs_only_filter' => {
153
+ 'only_filter' => "$.title"
154
+ },
155
+ 'rs_only_text' => {
156
+ 'plain_text_tag' => {
157
+ 'text' => 'hello world'
158
+ }
159
+ },
160
+ 'rs_text_with_suffix' => {
161
+ 'text_tag_with_suffix' => {
162
+ 'text' => ['hello_world', {'suffix' => '-suffix'}]
163
+ }
164
+ },
165
+ 'rs_map_with_json_filter' => {
166
+ 'language' => {
167
+ '@' => {'nl' => 'dut', 'fr' => 'fre', 'de' => 'ger', 'en' => 'eng'}
168
+ }
169
+ },
170
+ 'rs_hash_with_json_filter' => {
171
+ 'multiple_of_2' => {
172
+ '@' => lambda { |d| d.to_i * 2 }
173
+ }
174
+ },
175
+ 'rs_hash_with_multiple_json_filter' => {
176
+ 'multiple_of' => [
177
+ {'@' => lambda { |d| d.to_i * 2 }},
178
+ {'@' => lambda { |d| d.to_i * 3 }}
179
+ ]
180
+ },
181
+ 'rs_hash_with_json_filter_and_suffix' => {
182
+ 'multiple_of_with_suffix' => {
183
+ '@' => [lambda {|d| d.to_i*2}, 'suffix' => '-multiple_of_2']
184
+ }
185
+ },
186
+ 'rs_hash_with_json_filter_and_multiple_lambdas' => {
187
+ 'multiple_lambdas' => {
188
+ '@' => [lambda {|d| d.to_i*2}, lambda {|d| Math.sqrt(d.to_i) }]
189
+ }
190
+ },
191
+ 'rs_hash_with_json_filter_and_option' => {
192
+ 'subjects' => {
193
+ '$..subject' => [
194
+ lambda {|d,o|
195
+ {
196
+ doc_id: o['id'],
197
+ subject: d
198
+ }
199
+ }
200
+ ]
201
+ }
202
+ }
203
+ ```
204
+
205
+ Here is an example of how to call the last rule set, "rs_hash_with_json_filter_and_option".
206
+
207
+ ***rules_ng.run*** can take 4 parameters. The first 3 are mandatory. The last one, ***options***, can hold data that is static to a rule set.
208
+
209
+ ```ruby
210
+ include DataCollector::Core
211
+ output.clear
212
+ data = {'subject' => ['water', 'thermodynamics']}
213
+
214
+ rules_ng.run(RULE_SETS['rs_hash_with_json_filter_and_option'], data, output, {'id' => 1})
215
+
216
+ ```
217
+
218
+ Results in:
219
+ ```json
220
+ {
221
+ "subjects":[
222
+ {"doc_id":1,"subject":"water"},
223
+ {"doc_id":1,"subject":"thermodynamics"}
224
+ ]
225
+ }
226
+ ```
227
+
228
+
229
+
141
230
  #### config
142
231
  config is an object that points to "config.yml"; you can read data from and/or store data to this object.
143
232
 
@@ -32,7 +32,11 @@ module DataCollector
32
32
  unless v.nil?
33
33
  if data.has_key?(k)
34
34
  if data[k].is_a?(Array) then
35
- data[k] << v
35
+ if v.is_a?(Array)
36
+ data[k] += v
37
+ else
38
+ data[k] << v
39
+ end
36
40
  else
37
41
  t = data[k]
38
42
  data[k] = Array.new([t, v])
@@ -58,6 +62,22 @@ module DataCollector
58
62
  end
59
63
  end
60
64
 
65
+ def key?(key)
66
+ @data.key?(key)
67
+ end
68
+
69
+ def has_key?(key)
70
+ @data.key?(key)
71
+ end
72
+
73
+ def include?(key)
74
+ @data.key?(key)
75
+ end
76
+
77
+ def keys
78
+ @data.keys
79
+ end
80
+
61
81
  def raw
62
82
  @data
63
83
  end
@@ -10,29 +10,38 @@ module DataCollector
10
10
  rules.each do |tag, rule|
11
11
  apply_rule(tag, rule, input_data, output_data, options)
12
12
  end
13
+
14
+ output_data
13
15
  end
14
16
 
15
17
  private
18
+
16
19
  def apply_rule(tag, rule, input_data, output_data, options = {})
17
- if rule.is_a?(Array)
20
+ rule_filter = rule
21
+ rule_payload = ""
22
+
23
+ case rule
24
+ when Array
18
25
  rule.each do |sub_rule|
19
- apply_rule(tag, sub_rule, input_data, output_data, options)
26
+ apply_rule(tag, sub_rule, input_data, output_data, options)
20
27
  end
21
28
  return output_data
29
+ when String
30
+ rule_filter = rule
31
+ rule_payload = ""
32
+ else
33
+ rule_filter = rule.keys.first
34
+ rule_payload = rule.values.first
22
35
  end
23
36
 
24
- rule_filter = rule.keys.first
25
- rule_payload = rule.values.first
26
37
  case rule_filter
27
38
  when 'text'
28
39
  if rule_payload.is_a?(String)
29
40
  data = rule_payload
30
41
  else
31
- data = rule_payload.select{|s| s.is_a?(String)}
32
- rule_payload = rule_payload.delete_if{|s| s.is_a?(String)}
33
- if rule_payload.size == 1
34
- rule_payload = rule_payload.first
35
- end
42
+ data = rule_payload.select { |s| s.is_a?(String) }
43
+ rule_payload = rule_payload.reject { |s| s.is_a?(String) }
44
+ rule_payload = "@" if rule_payload.empty?
36
45
  end
37
46
  when /json_path\:/
38
47
  data = json_path_filter(rule_filter.gsub(/^json_path\:/), input_data)
@@ -42,17 +51,22 @@ module DataCollector
42
51
 
43
52
  data = apply_filtered_data_on_payload(data, rule_payload, options)
44
53
 
45
- output_data << {tag.to_sym => data}
54
+ output_data << {tag.to_sym => data} unless data.nil? || (data.is_a?(Array) && data.empty?)
55
+ rescue StandardError => e
56
+ puts "error running rule '#{tag}'\n\t#{e.message}"
57
+ puts e.backtrace.join("\n")
46
58
  end
47
59
 
48
60
  def apply_filtered_data_on_payload(input_data, payload, options = {})
61
+ return nil if input_data.nil?
49
62
  output_data = nil
50
63
  case payload.class.name
51
64
  when 'Proc'
65
+ data = input_data.is_a?(Array) ? input_data : [input_data]
52
66
  if options && options.empty?
53
- output_data = payload.call(input_data)
67
+ output_data = data.map { |d| payload.call(d) }
54
68
  else
55
- output_data = payload.call(input_data, options)
69
+ output_data = data.map { |d| payload.call(d, options) }
56
70
  end
57
71
  when 'Hash'
58
72
  input_data = [input_data] unless input_data.is_a?(Array)
@@ -69,10 +83,13 @@ module DataCollector
69
83
  output_data = apply_filtered_data_on_payload(output_data, p, options)
70
84
  end
71
85
  else
72
- output_data = input_data
86
+ output_data = [input_data]
73
87
  end
74
88
 
75
89
  output_data.compact! if output_data.is_a?(Array)
90
+ output_data.flatten! if output_data.is_a?(Array)# || output_data.is_a?(Hash)
91
+ output_data = output_data.first if output_data.is_a?(Array) && output_data.size == 1 && (output_data.first.is_a?(Array) || output_data.first.is_a?(Hash))
92
+
76
93
  output_data
77
94
  end
78
95
 
@@ -1,4 +1,4 @@
1
1
  # encoding: utf-8
2
2
  module DataCollector
3
- VERSION = "0.4.0"
3
+ VERSION = "0.9.0"
4
4
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: data_collector
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.9.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Mehmet Celik
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-03-20 00:00:00.000000000 Z
11
+ date: 2020-06-03 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -226,7 +226,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
226
226
  - !ruby/object:Gem::Version
227
227
  version: '0'
228
228
  requirements: []
229
- rubygems_version: 3.0.2
229
+ rubygems_version: 3.0.6
230
230
  signing_key:
231
231
  specification_version: 4
232
232
  summary: ETL helper library