data_collector 0.4.0 → 0.9.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 5fb3cc5c42f536a8bdc99421eb6f46972c6c97597f5c0af63e96463f320837ef
4
- data.tar.gz: d377133bf042985f6d68b2bb2960d9c4ddae169a39d4602df4a3f4003d6b6434
3
+ metadata.gz: bfc42cf6807d68d0bf356494918fb3309ad19de5b037e9fac5f374c82a267ef6
4
+ data.tar.gz: a13d553f5003943755cc3e14dbdfd3fbf22a6ee00625ace045e468dba7ebf160
5
5
  SHA512:
6
- metadata.gz: cd48af243e1615e8cafaf8ed1ab4d9019661e3205286fbd32c9ec4bb3e2faa7e07025a62b378062f10dfc0f1775acb401ad4cb901851cbbaf4cb78a2e7c6474a
7
- data.tar.gz: c92eee26040f89ab0b691ba6bf4ad9792222e545e82bb9ebea8b4a4941831615a74d9f389e58d6feccf89ba42c6dd5a02c55bfed996fc670cd1d9600d5023476
6
+ metadata.gz: 6ec4eab7ef70145e63a84c814683e5faf026ab37d51e687f24b456a91d66e1bbb22955ec2de70f59eb6c877f1c133c1b62344dc7382abaa859f36c85d82a0fa3
7
+ data.tar.gz: d1f88a5d04f68c699837032f2f74fd30ebfcad7a492dc0009c211e9462a6c8f711141809dc1f01ad1b38de604353a490008d0470b767278dcd088ef2d6f37ef9
data/README.md CHANGED
@@ -1,10 +1,15 @@
1
1
  # DataCollector
2
- Convenience module to Extract, Transform and Load your data.
3
- You have 3 main objects that you can use for ETL => INPUT, OUTPUT and FILTER
4
- and support objects like CONFIG, LOG, RULES and the new RULES_NG
2
+ Convenience module to Extract, Transform and Load your data.
3
+ The main objects 'INPUT', 'OUTPUT' and 'FILTER' are the basic ETL components; they help you to input, output and filter data.
4
+ Support objects like CONFIG, LOG, RULES and the new RULES_NG are there just to make life easier.
5
5
 
6
6
  Including the DataCollector::Core module into your application gives you access to these objects.
7
7
 
8
+ The RULES and RULES_NG objects are built on a very simple concept. A rule consists of 3 components:
9
+ - a destination tag
10
+ - a jsonpath filter to get the data
11
+ - a lambda to execute on every filter hit
12
+
8
13
 
9
14
  #### input
10
15
  Read input from a URI. This URI can have an http, https or file scheme
@@ -92,7 +97,7 @@ filter data from a hash using [JSONPath](http://goessner.net/articles/JsonPath/i
92
97
  filtered_data = filter(data, "$..metadata.record")
93
98
  ```
94
99
 
95
- #### rules
100
+ #### rules (deprecated)
96
101
  See newer rules_ng object
97
102
  Allows you to define a simple lambda structure to run against a JSONPath filter
98
103
 
@@ -138,6 +143,90 @@ RULE_SET
138
143
  SUFFIX
139
144
  ```
140
145
 
146
+ ##### Examples
147
+
148
+ Here are the different rule combinations that are possible:
149
+
150
+ ``` ruby
151
+ RULE_SETS = {
152
+ 'rs_only_filter' => {
153
+ 'only_filter' => "$.title"
154
+ },
155
+ 'rs_only_text' => {
156
+ 'plain_text_tag' => {
157
+ 'text' => 'hello world'
158
+ }
159
+ },
160
+ 'rs_text_with_suffix' => {
161
+ 'text_tag_with_suffix' => {
162
+ 'text' => ['hello_world', {'suffix' => '-suffix'}]
163
+ }
164
+ },
165
+ 'rs_map_with_json_filter' => {
166
+ 'language' => {
167
+ '@' => {'nl' => 'dut', 'fr' => 'fre', 'de' => 'ger', 'en' => 'eng'}
168
+ }
169
+ },
170
+ 'rs_hash_with_json_filter' => {
171
+ 'multiple_of_2' => {
172
+ '@' => lambda { |d| d.to_i * 2 }
173
+ }
174
+ },
175
+ 'rs_hash_with_multiple_json_filter' => {
176
+ 'multiple_of' => [
177
+ {'@' => lambda { |d| d.to_i * 2 }},
178
+ {'@' => lambda { |d| d.to_i * 3 }}
179
+ ]
180
+ },
181
+ 'rs_hash_with_json_filter_and_suffix' => {
182
+ 'multiple_of_with_suffix' => {
183
+ '@' => [lambda {|d| d.to_i*2}, 'suffix' => '-multiple_of_2']
184
+ }
185
+ },
186
+ 'rs_hash_with_json_filter_and_multiple_lambdas' => {
187
+ 'multiple_lambdas' => {
188
+ '@' => [lambda {|d| d.to_i*2}, lambda {|d| Math.sqrt(d.to_i) }]
189
+ }
190
+ },
191
+ 'rs_hash_with_json_filter_and_option' => {
192
+ 'subjects' => {
193
+ '$..subject' => [
194
+ lambda {|d,o|
195
+ {
196
+ doc_id: o['id'],
197
+ subject: d
198
+ }
199
+ }
200
+ ]
201
+ }
202
+ }
203
+ ```
204
+
205
+ Here is an example of how to call the last rule set, "rs_hash_with_json_filter_and_option".
206
+
207
+ ***rules_ng.run*** can take 4 parameters. The first 3 are mandatory. The last one, ***options***, can hold data that is static to a rule set.
208
+
209
+ ```ruby
210
+ include DataCollector::Core
211
+ output.clear
212
+ data = {'subject' => ['water', 'thermodynamics']}
213
+
214
+ rules_ng.run(RULE_SETS['rs_hash_with_json_filter_and_option'], data, output, {'id' => 1})
215
+
216
+ ```
217
+
218
+ Results in:
219
+ ```json
220
+ {
221
+ "subjects":[
222
+ {"doc_id":1,"subject":"water"},
223
+ {"doc_id":1,"subject":"thermodynamics"}
224
+ ]
225
+ }
226
+ ```
227
+
228
+
229
+
141
230
  #### config
142
231
  config is an object that points to "config.yml"; you can read data from and/or store data in this object.
143
232
 
@@ -32,7 +32,11 @@ module DataCollector
32
32
  unless v.nil?
33
33
  if data.has_key?(k)
34
34
  if data[k].is_a?(Array) then
35
- data[k] << v
35
+ if v.is_a?(Array)
36
+ data[k] += v
37
+ else
38
+ data[k] << v
39
+ end
36
40
  else
37
41
  t = data[k]
38
42
  data[k] = Array.new([t, v])
@@ -58,6 +62,22 @@ module DataCollector
58
62
  end
59
63
  end
60
64
 
65
+ def key?(key)
66
+ @data.key?(key)
67
+ end
68
+
69
+ def has_key?(key)
70
+ @data.key?(key)
71
+ end
72
+
73
+ def include?(key)
74
+ @data.key?(key)
75
+ end
76
+
77
+ def keys
78
+ @data.keys
79
+ end
80
+
61
81
  def raw
62
82
  @data
63
83
  end
@@ -10,29 +10,38 @@ module DataCollector
10
10
  rules.each do |tag, rule|
11
11
  apply_rule(tag, rule, input_data, output_data, options)
12
12
  end
13
+
14
+ output_data
13
15
  end
14
16
 
15
17
  private
18
+
16
19
  def apply_rule(tag, rule, input_data, output_data, options = {})
17
- if rule.is_a?(Array)
20
+ rule_filter = rule
21
+ rule_payload = ""
22
+
23
+ case rule
24
+ when Array
18
25
  rule.each do |sub_rule|
19
- apply_rule(tag, sub_rule, input_data, output_data, options)
26
+ apply_rule(tag, sub_rule, input_data, output_data, options)
20
27
  end
21
28
  return output_data
29
+ when String
30
+ rule_filter = rule
31
+ rule_payload = ""
32
+ else
33
+ rule_filter = rule.keys.first
34
+ rule_payload = rule.values.first
22
35
  end
23
36
 
24
- rule_filter = rule.keys.first
25
- rule_payload = rule.values.first
26
37
  case rule_filter
27
38
  when 'text'
28
39
  if rule_payload.is_a?(String)
29
40
  data = rule_payload
30
41
  else
31
- data = rule_payload.select{|s| s.is_a?(String)}
32
- rule_payload = rule_payload.delete_if{|s| s.is_a?(String)}
33
- if rule_payload.size == 1
34
- rule_payload = rule_payload.first
35
- end
42
+ data = rule_payload.select { |s| s.is_a?(String) }
43
+ rule_payload = rule_payload.reject { |s| s.is_a?(String) }
44
+ rule_payload = "@" if rule_payload.empty?
36
45
  end
37
46
  when /json_path\:/
38
47
  data = json_path_filter(rule_filter.gsub(/^json_path\:/), input_data)
@@ -42,17 +51,22 @@ module DataCollector
42
51
 
43
52
  data = apply_filtered_data_on_payload(data, rule_payload, options)
44
53
 
45
- output_data << {tag.to_sym => data}
54
+ output_data << {tag.to_sym => data} unless data.nil? || (data.is_a?(Array) && data.empty?)
55
+ rescue StandardError => e
56
+ puts "error running rule '#{tag}'\n\t#{e.message}"
57
+ puts e.backtrace.join("\n")
46
58
  end
47
59
 
48
60
  def apply_filtered_data_on_payload(input_data, payload, options = {})
61
+ return nil if input_data.nil?
49
62
  output_data = nil
50
63
  case payload.class.name
51
64
  when 'Proc'
65
+ data = input_data.is_a?(Array) ? input_data : [input_data]
52
66
  if options && options.empty?
53
- output_data = payload.call(input_data)
67
+ output_data = data.map { |d| payload.call(d) }
54
68
  else
55
- output_data = payload.call(input_data, options)
69
+ output_data = data.map { |d| payload.call(d, options) }
56
70
  end
57
71
  when 'Hash'
58
72
  input_data = [input_data] unless input_data.is_a?(Array)
@@ -69,10 +83,13 @@ module DataCollector
69
83
  output_data = apply_filtered_data_on_payload(output_data, p, options)
70
84
  end
71
85
  else
72
- output_data = input_data
86
+ output_data = [input_data]
73
87
  end
74
88
 
75
89
  output_data.compact! if output_data.is_a?(Array)
90
+ output_data.flatten! if output_data.is_a?(Array)# || output_data.is_a?(Hash)
91
+ output_data = output_data.first if output_data.is_a?(Array) && output_data.size == 1 && (output_data.first.is_a?(Array) || output_data.first.is_a?(Hash))
92
+
76
93
  output_data
77
94
  end
78
95
 
@@ -1,4 +1,4 @@
1
1
  # encoding: utf-8
2
2
  module DataCollector
3
- VERSION = "0.4.0"
3
+ VERSION = "0.9.0"
4
4
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: data_collector
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.9.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Mehmet Celik
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-03-20 00:00:00.000000000 Z
11
+ date: 2020-06-03 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -226,7 +226,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
226
226
  - !ruby/object:Gem::Version
227
227
  version: '0'
228
228
  requirements: []
229
- rubygems_version: 3.0.2
229
+ rubygems_version: 3.0.6
230
230
  signing_key:
231
231
  specification_version: 4
232
232
  summary: ETL helper library