data_collector 0.4.0 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +93 -4
- data/lib/data_collector/output.rb +21 -1
- data/lib/data_collector/rules_ng.rb +30 -13
- data/lib/data_collector/version.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: bfc42cf6807d68d0bf356494918fb3309ad19de5b037e9fac5f374c82a267ef6
|
4
|
+
data.tar.gz: a13d553f5003943755cc3e14dbdfd3fbf22a6ee00625ace045e468dba7ebf160
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6ec4eab7ef70145e63a84c814683e5faf026ab37d51e687f24b456a91d66e1bbb22955ec2de70f59eb6c877f1c133c1b62344dc7382abaa859f36c85d82a0fa3
|
7
|
+
data.tar.gz: d1f88a5d04f68c699837032f2f74fd30ebfcad7a492dc0009c211e9462a6c8f711141809dc1f01ad1b38de604353a490008d0470b767278dcd088ef2d6f37ef9
|
data/README.md
CHANGED
@@ -1,10 +1,15 @@
|
|
1
1
|
# DataCollector
|
2
|
-
Convenience module to Extract, Transform and Load your data.
|
3
|
-
You have
|
4
|
-
|
2
|
+
Convenience module to Extract, Transform and Load your data.
|
3
|
+
You have main objects that help you to 'INPUT', 'OUTPUT' and 'FILTER' data. The basic ETL components.
|
4
|
+
Support objects like CONFIG, LOG, RULES and the new RULES_NG just to make life easier.
|
5
5
|
|
6
6
|
Including the DataCollector::Core module into your application gives you access to these objects.
|
7
7
|
|
8
|
+
The RULES and RULES_NG objects work on a very simple concept. Rules consist of 3 components:
|
9
|
+
- a destination tag
|
10
|
+
- a jsonpath filter to get the data
|
11
|
+
- a lambda to execute on every filter hit
|
12
|
+
|
8
13
|
|
9
14
|
#### input
|
10
15
|
Read input from a URI. This URI can have an http, https or file scheme
|
@@ -92,7 +97,7 @@ filter data from a hash using [JSONPath](http://goessner.net/articles/JsonPath/i
|
|
92
97
|
filtered_data = filter(data, "$..metadata.record")
|
93
98
|
```
|
94
99
|
|
95
|
-
#### rules
|
100
|
+
#### rules (deprecated)
|
96
101
|
See newer rules_ng object
|
97
102
|
Allows you to define a simple lambda structure to run against a JSONPath filter
|
98
103
|
|
@@ -138,6 +143,90 @@ RULE_SET
|
|
138
143
|
SUFFIX
|
139
144
|
```
|
140
145
|
|
146
|
+
##### Examples
|
147
|
+
|
148
|
+
Here you find the different rule combinations that are possible
|
149
|
+
|
150
|
+
``` ruby
|
151
|
+
RULE_SETS = {
|
152
|
+
'rs_only_filter' => {
|
153
|
+
'only_filter' => "$.title"
|
154
|
+
},
|
155
|
+
'rs_only_text' => {
|
156
|
+
'plain_text_tag' => {
|
157
|
+
'text' => 'hello world'
|
158
|
+
}
|
159
|
+
},
|
160
|
+
'rs_text_with_suffix' => {
|
161
|
+
'text_tag_with_suffix' => {
|
162
|
+
'text' => ['hello_world', {'suffix' => '-suffix'}]
|
163
|
+
}
|
164
|
+
},
|
165
|
+
'rs_map_with_json_filter' => {
|
166
|
+
'language' => {
|
167
|
+
'@' => {'nl' => 'dut', 'fr' => 'fre', 'de' => 'ger', 'en' => 'eng'}
|
168
|
+
}
|
169
|
+
},
|
170
|
+
'rs_hash_with_json_filter' => {
|
171
|
+
'multiple_of_2' => {
|
172
|
+
'@' => lambda { |d| d.to_i * 2 }
|
173
|
+
}
|
174
|
+
},
|
175
|
+
'rs_hash_with_multiple_json_filter' => {
|
176
|
+
'multiple_of' => [
|
177
|
+
{'@' => lambda { |d| d.to_i * 2 }},
|
178
|
+
{'@' => lambda { |d| d.to_i * 3 }}
|
179
|
+
]
|
180
|
+
},
|
181
|
+
'rs_hash_with_json_filter_and_suffix' => {
|
182
|
+
'multiple_of_with_suffix' => {
|
183
|
+
'@' => [lambda {|d| d.to_i*2}, 'suffix' => '-multiple_of_2']
|
184
|
+
}
|
185
|
+
},
|
186
|
+
'rs_hash_with_json_filter_and_multiple_lambdas' => {
|
187
|
+
'multiple_lambdas' => {
|
188
|
+
'@' => [lambda {|d| d.to_i*2}, lambda {|d| Math.sqrt(d.to_i) }]
|
189
|
+
}
|
190
|
+
},
|
191
|
+
'rs_hash_with_json_filter_and_option' => {
|
192
|
+
'subjects' => {
|
193
|
+
'$..subject' => [
|
194
|
+
lambda {|d,o|
|
195
|
+
{
|
196
|
+
doc_id: o['id'],
|
197
|
+
subject: d
|
198
|
+
}
|
199
|
+
}
|
200
|
+
]
|
201
|
+
}
|
202
|
+
}
|
203
|
+
```
|
204
|
+
|
205
|
+
Here is an example of how to call the last rule set, "rs_hash_with_json_filter_and_option".
|
206
|
+
|
207
|
+
***rules_ng.run*** accepts 4 parameters. The first 3 are mandatory. The last one, ***options***, can hold data that is static to a rule set.
|
208
|
+
|
209
|
+
```ruby
|
210
|
+
include DataCollector::Core
|
211
|
+
output.clear
|
212
|
+
data = {'subject' => ['water', 'thermodynamics']}
|
213
|
+
|
214
|
+
rules_ng.run(RULE_SETS['rs_hash_with_json_filter_and_option'], data, output, {'id' => 1})
|
215
|
+
|
216
|
+
```
|
217
|
+
|
218
|
+
Results in:
|
219
|
+
```json
|
220
|
+
{
|
221
|
+
"subjects":[
|
222
|
+
{"doc_id":1,"subject":"water"},
|
223
|
+
{"doc_id":1,"subject":"thermodynamics"}
|
224
|
+
]
|
225
|
+
}
|
226
|
+
```
|
227
|
+
|
228
|
+
|
229
|
+
|
141
230
|
#### config
|
142
231
|
config is an object that points to "config.yml"; you can read data from and/or store data to this object.
|
143
232
|
|
@@ -32,7 +32,11 @@ module DataCollector
|
|
32
32
|
unless v.nil?
|
33
33
|
if data.has_key?(k)
|
34
34
|
if data[k].is_a?(Array) then
|
35
|
-
|
35
|
+
if v.is_a?(Array)
|
36
|
+
data[k] += v
|
37
|
+
else
|
38
|
+
data[k] << v
|
39
|
+
end
|
36
40
|
else
|
37
41
|
t = data[k]
|
38
42
|
data[k] = Array.new([t, v])
|
@@ -58,6 +62,22 @@ module DataCollector
|
|
58
62
|
end
|
59
63
|
end
|
60
64
|
|
65
|
+
def key?(key)
|
66
|
+
@data.key?(key)
|
67
|
+
end
|
68
|
+
|
69
|
+
def has_key?(key)
|
70
|
+
@data.key?(key)
|
71
|
+
end
|
72
|
+
|
73
|
+
def include?(key)
|
74
|
+
@data.key?(key)
|
75
|
+
end
|
76
|
+
|
77
|
+
def keys
|
78
|
+
@data.keys
|
79
|
+
end
|
80
|
+
|
61
81
|
def raw
|
62
82
|
@data
|
63
83
|
end
|
@@ -10,29 +10,38 @@ module DataCollector
|
|
10
10
|
rules.each do |tag, rule|
|
11
11
|
apply_rule(tag, rule, input_data, output_data, options)
|
12
12
|
end
|
13
|
+
|
14
|
+
output_data
|
13
15
|
end
|
14
16
|
|
15
17
|
private
|
18
|
+
|
16
19
|
def apply_rule(tag, rule, input_data, output_data, options = {})
|
17
|
-
|
20
|
+
rule_filter = rule
|
21
|
+
rule_payload = ""
|
22
|
+
|
23
|
+
case rule
|
24
|
+
when Array
|
18
25
|
rule.each do |sub_rule|
|
19
|
-
|
26
|
+
apply_rule(tag, sub_rule, input_data, output_data, options)
|
20
27
|
end
|
21
28
|
return output_data
|
29
|
+
when String
|
30
|
+
rule_filter = rule
|
31
|
+
rule_payload = ""
|
32
|
+
else
|
33
|
+
rule_filter = rule.keys.first
|
34
|
+
rule_payload = rule.values.first
|
22
35
|
end
|
23
36
|
|
24
|
-
rule_filter = rule.keys.first
|
25
|
-
rule_payload = rule.values.first
|
26
37
|
case rule_filter
|
27
38
|
when 'text'
|
28
39
|
if rule_payload.is_a?(String)
|
29
40
|
data = rule_payload
|
30
41
|
else
|
31
|
-
data = rule_payload.select{|s| s.is_a?(String)}
|
32
|
-
rule_payload = rule_payload.
|
33
|
-
if rule_payload.
|
34
|
-
rule_payload = rule_payload.first
|
35
|
-
end
|
42
|
+
data = rule_payload.select { |s| s.is_a?(String) }
|
43
|
+
rule_payload = rule_payload.reject { |s| s.is_a?(String) }
|
44
|
+
rule_payload = "@" if rule_payload.empty?
|
36
45
|
end
|
37
46
|
when /json_path\:/
|
38
47
|
data = json_path_filter(rule_filter.gsub(/^json_path\:/), input_data)
|
@@ -42,17 +51,22 @@ module DataCollector
|
|
42
51
|
|
43
52
|
data = apply_filtered_data_on_payload(data, rule_payload, options)
|
44
53
|
|
45
|
-
output_data << {tag.to_sym => data}
|
54
|
+
output_data << {tag.to_sym => data} unless data.nil? || (data.is_a?(Array) && data.empty?)
|
55
|
+
rescue StandardError => e
|
56
|
+
puts "error running rule '#{tag}'\n\t#{e.message}"
|
57
|
+
puts e.backtrace.join("\n")
|
46
58
|
end
|
47
59
|
|
48
60
|
def apply_filtered_data_on_payload(input_data, payload, options = {})
|
61
|
+
return nil if input_data.nil?
|
49
62
|
output_data = nil
|
50
63
|
case payload.class.name
|
51
64
|
when 'Proc'
|
65
|
+
data = input_data.is_a?(Array) ? input_data : [input_data]
|
52
66
|
if options && options.empty?
|
53
|
-
output_data = payload.call(
|
67
|
+
output_data = data.map { |d| payload.call(d) }
|
54
68
|
else
|
55
|
-
output_data = payload.call(
|
69
|
+
output_data = data.map { |d| payload.call(d, options) }
|
56
70
|
end
|
57
71
|
when 'Hash'
|
58
72
|
input_data = [input_data] unless input_data.is_a?(Array)
|
@@ -69,10 +83,13 @@ module DataCollector
|
|
69
83
|
output_data = apply_filtered_data_on_payload(output_data, p, options)
|
70
84
|
end
|
71
85
|
else
|
72
|
-
output_data = input_data
|
86
|
+
output_data = [input_data]
|
73
87
|
end
|
74
88
|
|
75
89
|
output_data.compact! if output_data.is_a?(Array)
|
90
|
+
output_data.flatten! if output_data.is_a?(Array)# || output_data.is_a?(Hash)
|
91
|
+
output_data = output_data.first if output_data.is_a?(Array) && output_data.size == 1 && (output_data.first.is_a?(Array) || output_data.first.is_a?(Hash))
|
92
|
+
|
76
93
|
output_data
|
77
94
|
end
|
78
95
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: data_collector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.0
|
4
|
+
version: 0.9.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Mehmet Celik
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-03
|
11
|
+
date: 2020-06-03 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -226,7 +226,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
226
226
|
- !ruby/object:Gem::Version
|
227
227
|
version: '0'
|
228
228
|
requirements: []
|
229
|
-
rubygems_version: 3.0.
|
229
|
+
rubygems_version: 3.0.6
|
230
230
|
signing_key:
|
231
231
|
specification_version: 4
|
232
232
|
summary: ETL helper library
|