data_collector 0.4.0 → 0.9.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +93 -4
- data/lib/data_collector/output.rb +21 -1
- data/lib/data_collector/rules_ng.rb +30 -13
- data/lib/data_collector/version.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: bfc42cf6807d68d0bf356494918fb3309ad19de5b037e9fac5f374c82a267ef6
|
4
|
+
data.tar.gz: a13d553f5003943755cc3e14dbdfd3fbf22a6ee00625ace045e468dba7ebf160
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6ec4eab7ef70145e63a84c814683e5faf026ab37d51e687f24b456a91d66e1bbb22955ec2de70f59eb6c877f1c133c1b62344dc7382abaa859f36c85d82a0fa3
|
7
|
+
data.tar.gz: d1f88a5d04f68c699837032f2f74fd30ebfcad7a492dc0009c211e9462a6c8f711141809dc1f01ad1b38de604353a490008d0470b767278dcd088ef2d6f37ef9
|
data/README.md
CHANGED
@@ -1,10 +1,15 @@
|
|
1
1
|
# DataCollector
|
2
|
-
Convenience module to Extract, Transform and Load your data.
|
3
|
-
You have
|
4
|
-
|
2
|
+
Convenience module to Extract, Transform and Load your data.
|
3
|
+
You have main objects that help you to 'INPUT', 'OUTPUT' and 'FILTER' data: the basic ETL components.
|
4
|
+
Support objects like CONFIG, LOG, RULES and the new RULES_NG are there just to make life easier.
|
5
5
|
|
6
6
|
Including the DataCollector::Core module into your application gives you access to these objects.
|
7
7
|
|
8
|
+
The RULES and RULES_NG objects are built on a very simple concept. A rule consists of 3 components:
|
9
|
+
- a destination tag
|
10
|
+
- a jsonpath filter to get the data
|
11
|
+
- a lambda to execute on every filter hit
|
12
|
+
|
8
13
|
|
9
14
|
#### input
|
10
15
|
Read input from an URI. This URI can have a http, https or file scheme
|
@@ -92,7 +97,7 @@ filter data from a hash using [JSONPath](http://goessner.net/articles/JsonPath/i
|
|
92
97
|
filtered_data = filter(data, "$..metadata.record")
|
93
98
|
```
|
94
99
|
|
95
|
-
#### rules
|
100
|
+
#### rules (deprecated)
|
96
101
|
See newer rules_ng object
|
97
102
|
Allows you to define a simple lambda structure to run against a JSONPath filter
|
98
103
|
|
@@ -138,6 +143,90 @@ RULE_SET
|
|
138
143
|
SUFFIX
|
139
144
|
```
|
140
145
|
|
146
|
+
##### Examples
|
147
|
+
|
148
|
+
Here you find the different rule combinations that are possible.
|
149
|
+
|
150
|
+
``` ruby
|
151
|
+
RULE_SETS = {
|
152
|
+
'rs_only_filter' => {
|
153
|
+
'only_filter' => "$.title"
|
154
|
+
},
|
155
|
+
'rs_only_text' => {
|
156
|
+
'plain_text_tag' => {
|
157
|
+
'text' => 'hello world'
|
158
|
+
}
|
159
|
+
},
|
160
|
+
'rs_text_with_suffix' => {
|
161
|
+
'text_tag_with_suffix' => {
|
162
|
+
'text' => ['hello_world', {'suffix' => '-suffix'}]
|
163
|
+
}
|
164
|
+
},
|
165
|
+
'rs_map_with_json_filter' => {
|
166
|
+
'language' => {
|
167
|
+
'@' => {'nl' => 'dut', 'fr' => 'fre', 'de' => 'ger', 'en' => 'eng'}
|
168
|
+
}
|
169
|
+
},
|
170
|
+
'rs_hash_with_json_filter' => {
|
171
|
+
'multiple_of_2' => {
|
172
|
+
'@' => lambda { |d| d.to_i * 2 }
|
173
|
+
}
|
174
|
+
},
|
175
|
+
'rs_hash_with_multiple_json_filter' => {
|
176
|
+
'multiple_of' => [
|
177
|
+
{'@' => lambda { |d| d.to_i * 2 }},
|
178
|
+
{'@' => lambda { |d| d.to_i * 3 }}
|
179
|
+
]
|
180
|
+
},
|
181
|
+
'rs_hash_with_json_filter_and_suffix' => {
|
182
|
+
'multiple_of_with_suffix' => {
|
183
|
+
'@' => [lambda {|d| d.to_i*2}, 'suffix' => '-multiple_of_2']
|
184
|
+
}
|
185
|
+
},
|
186
|
+
'rs_hash_with_json_filter_and_multiple_lambdas' => {
|
187
|
+
'multiple_lambdas' => {
|
188
|
+
'@' => [lambda {|d| d.to_i*2}, lambda {|d| Math.sqrt(d.to_i) }]
|
189
|
+
}
|
190
|
+
},
|
191
|
+
'rs_hash_with_json_filter_and_option' => {
|
192
|
+
'subjects' => {
|
193
|
+
'$..subject' => [
|
194
|
+
lambda {|d,o|
|
195
|
+
{
|
196
|
+
doc_id: o['id'],
|
197
|
+
subject: d
|
198
|
+
}
|
199
|
+
}
|
200
|
+
]
|
201
|
+
}
|
202
|
+
}
|
203
|
+
```
|
204
|
+
|
205
|
+
Here is an example of how to call the last rule set, "rs_hash_with_json_filter_and_option".
|
206
|
+
|
207
|
+
***rules_ng.run*** can take 4 parameters. The first 3 are mandatory. The last one, ***options***, can hold data that is static to a rule set.
|
208
|
+
|
209
|
+
```ruby
|
210
|
+
include DataCollector::Core
|
211
|
+
output.clear
|
212
|
+
data = {'subject' => ['water', 'thermodynamics']}
|
213
|
+
|
214
|
+
rules_ng.run(RULE_SETS['rs_hash_with_json_filter_and_option'], data, output, {'id' => 1})
|
215
|
+
|
216
|
+
```
|
217
|
+
|
218
|
+
Results in:
|
219
|
+
```json
|
220
|
+
{
|
221
|
+
"subjects":[
|
222
|
+
{"doc_id":1,"subject":"water"},
|
223
|
+
{"doc_id":1,"subject":"thermodynamics"}
|
224
|
+
]
|
225
|
+
}
|
226
|
+
```
|
227
|
+
|
228
|
+
|
229
|
+
|
141
230
|
#### config
|
142
231
|
config is an object that points to "config.yml". You can read data from and/or store data to this object.
|
143
232
|
|
@@ -32,7 +32,11 @@ module DataCollector
|
|
32
32
|
unless v.nil?
|
33
33
|
if data.has_key?(k)
|
34
34
|
if data[k].is_a?(Array) then
|
35
|
-
|
35
|
+
if v.is_a?(Array)
|
36
|
+
data[k] += v
|
37
|
+
else
|
38
|
+
data[k] << v
|
39
|
+
end
|
36
40
|
else
|
37
41
|
t = data[k]
|
38
42
|
data[k] = Array.new([t, v])
|
@@ -58,6 +62,22 @@ module DataCollector
|
|
58
62
|
end
|
59
63
|
end
|
60
64
|
|
65
|
+
def key?(key)
|
66
|
+
@data.key?(key)
|
67
|
+
end
|
68
|
+
|
69
|
+
def has_key?(key)
|
70
|
+
@data.key?(key)
|
71
|
+
end
|
72
|
+
|
73
|
+
def include?(key)
|
74
|
+
@data.key?(key)
|
75
|
+
end
|
76
|
+
|
77
|
+
def keys
|
78
|
+
@data.keys
|
79
|
+
end
|
80
|
+
|
61
81
|
def raw
|
62
82
|
@data
|
63
83
|
end
|
@@ -10,29 +10,38 @@ module DataCollector
|
|
10
10
|
rules.each do |tag, rule|
|
11
11
|
apply_rule(tag, rule, input_data, output_data, options)
|
12
12
|
end
|
13
|
+
|
14
|
+
output_data
|
13
15
|
end
|
14
16
|
|
15
17
|
private
|
18
|
+
|
16
19
|
def apply_rule(tag, rule, input_data, output_data, options = {})
|
17
|
-
|
20
|
+
rule_filter = rule
|
21
|
+
rule_payload = ""
|
22
|
+
|
23
|
+
case rule
|
24
|
+
when Array
|
18
25
|
rule.each do |sub_rule|
|
19
|
-
|
26
|
+
apply_rule(tag, sub_rule, input_data, output_data, options)
|
20
27
|
end
|
21
28
|
return output_data
|
29
|
+
when String
|
30
|
+
rule_filter = rule
|
31
|
+
rule_payload = ""
|
32
|
+
else
|
33
|
+
rule_filter = rule.keys.first
|
34
|
+
rule_payload = rule.values.first
|
22
35
|
end
|
23
36
|
|
24
|
-
rule_filter = rule.keys.first
|
25
|
-
rule_payload = rule.values.first
|
26
37
|
case rule_filter
|
27
38
|
when 'text'
|
28
39
|
if rule_payload.is_a?(String)
|
29
40
|
data = rule_payload
|
30
41
|
else
|
31
|
-
data = rule_payload.select{|s| s.is_a?(String)}
|
32
|
-
rule_payload = rule_payload.
|
33
|
-
if rule_payload.
|
34
|
-
rule_payload = rule_payload.first
|
35
|
-
end
|
42
|
+
data = rule_payload.select { |s| s.is_a?(String) }
|
43
|
+
rule_payload = rule_payload.reject { |s| s.is_a?(String) }
|
44
|
+
rule_payload = "@" if rule_payload.empty?
|
36
45
|
end
|
37
46
|
when /json_path\:/
|
38
47
|
data = json_path_filter(rule_filter.gsub(/^json_path\:/), input_data)
|
@@ -42,17 +51,22 @@ module DataCollector
|
|
42
51
|
|
43
52
|
data = apply_filtered_data_on_payload(data, rule_payload, options)
|
44
53
|
|
45
|
-
output_data << {tag.to_sym => data}
|
54
|
+
output_data << {tag.to_sym => data} unless data.nil? || (data.is_a?(Array) && data.empty?)
|
55
|
+
rescue StandardError => e
|
56
|
+
puts "error running rule '#{tag}'\n\t#{e.message}"
|
57
|
+
puts e.backtrace.join("\n")
|
46
58
|
end
|
47
59
|
|
48
60
|
def apply_filtered_data_on_payload(input_data, payload, options = {})
|
61
|
+
return nil if input_data.nil?
|
49
62
|
output_data = nil
|
50
63
|
case payload.class.name
|
51
64
|
when 'Proc'
|
65
|
+
data = input_data.is_a?(Array) ? input_data : [input_data]
|
52
66
|
if options && options.empty?
|
53
|
-
output_data = payload.call(
|
67
|
+
output_data = data.map { |d| payload.call(d) }
|
54
68
|
else
|
55
|
-
output_data = payload.call(
|
69
|
+
output_data = data.map { |d| payload.call(d, options) }
|
56
70
|
end
|
57
71
|
when 'Hash'
|
58
72
|
input_data = [input_data] unless input_data.is_a?(Array)
|
@@ -69,10 +83,13 @@ module DataCollector
|
|
69
83
|
output_data = apply_filtered_data_on_payload(output_data, p, options)
|
70
84
|
end
|
71
85
|
else
|
72
|
-
output_data = input_data
|
86
|
+
output_data = [input_data]
|
73
87
|
end
|
74
88
|
|
75
89
|
output_data.compact! if output_data.is_a?(Array)
|
90
|
+
output_data.flatten! if output_data.is_a?(Array)# || output_data.is_a?(Hash)
|
91
|
+
output_data = output_data.first if output_data.is_a?(Array) && output_data.size == 1 && (output_data.first.is_a?(Array) || output_data.first.is_a?(Hash))
|
92
|
+
|
76
93
|
output_data
|
77
94
|
end
|
78
95
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: data_collector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.9.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Mehmet Celik
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-03
|
11
|
+
date: 2020-06-03 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -226,7 +226,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
226
226
|
- !ruby/object:Gem::Version
|
227
227
|
version: '0'
|
228
228
|
requirements: []
|
229
|
-
rubygems_version: 3.0.
|
229
|
+
rubygems_version: 3.0.6
|
230
230
|
signing_key:
|
231
231
|
specification_version: 4
|
232
232
|
summary: ETL helper library
|