data_collector 0.62.0 → 0.63.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +404 -337
- data/lib/data_collector/rules_ng.rb +14 -7
- data/lib/data_collector/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9006d40456c852cf21b98977606ed64c8211a40e9796e94bb7ec07c5837c1b03
|
4
|
+
data.tar.gz: ab5172a04a592e5a319d95f9e6e4918eb3e6eb3f7783b2583a89428a5145efca
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6ff82dcd3e220c5b3ef6d5ffc6719f3c06e90b4aa71490feb0f07c0c4375f16bbddb0e7f80a3a3dd96870b85a018ccd4da091d7acf321141c85b2f9f77043f39
|
7
|
+
data.tar.gz: c6d4c02d7c93fd4bb90813bcd20285a40dd781ba161e38770ab9ba627609c08ef11cd56bb8a6703030a1573ade7e6a7011e37068a97226ce334947980215d171
|
data/README.md
CHANGED
@@ -1,364 +1,452 @@
|
|
1
|
-
# DataCollector
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
1
|
+
# DataCollector Ruby Gem
|
2
|
+
|
3
|
+
## Overview
|
4
|
+
|
5
|
+
DataCollector is a convenience module for Extract, Transform, and Load (ETL) operations in a pipeline architecture. It provides a simple way to collect, process, transform, and transfer data to various systems and applications.
|
6
|
+
|
7
|
+
## Installation
|
8
|
+
|
9
|
+
Add this line to your application's Gemfile:
|
10
|
+
|
11
|
+
```ruby
|
12
|
+
gem 'data_collector'
|
13
|
+
```
|
14
|
+
|
15
|
+
And then execute:
|
16
|
+
|
17
|
+
```
|
18
|
+
$ bundle
|
19
|
+
```
|
20
|
+
|
21
|
+
Or install it yourself as:
|
22
|
+
|
23
|
+
```
|
24
|
+
$ gem install data_collector
|
25
|
+
```
|
26
|
+
|
27
|
+
## Getting Started
|
28
|
+
|
29
|
+
Include the DataCollector::Core module in your application to access all available objects:
|
30
|
+
|
6
31
|
```ruby
|
32
|
+
require 'data_collector'
|
7
33
|
include DataCollector::Core
|
8
34
|
```
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
+
|
36
|
+
This gives you access to the following objects: `pipeline`, `input`, `output`, `filter`, `rules`, `config`, `log`, and `error`.
|
37
|
+
|
38
|
+
## Core Components
|
39
|
+
|
40
|
+
### Pipeline
|
41
|
+
|
42
|
+
The Pipeline object allows you to create a data processing pipeline with scheduled execution.
|
43
|
+
|
44
|
+
#### Methods
|
45
|
+
|
46
|
+
- `.new(options)`: Create a new pipeline
|
47
|
+
- Options:
|
48
|
+
- `name`: Pipeline name
|
49
|
+
- `schedule`: ISO8601 duration format (e.g., 'PT10M' for every 10 minutes)
|
50
|
+
- `cron`: Cron format (e.g., '0 6 * * *' for 6:00 AM daily)
|
51
|
+
- `uri`: Directory/file to watch
|
52
|
+
- `xml_typecast`: Convert string values to appropriate types (true/false)
|
53
|
+
- `.run`: Start the pipeline (blocking if schedule is supplied)
|
54
|
+
- `.stop`: Stop the pipeline
|
55
|
+
- `.pause`: Pause the pipeline
|
56
|
+
- `.running?`: Check if pipeline is running
|
57
|
+
- `.stopped?`: Check if pipeline is not running
|
58
|
+
- `.paused?`: Check if pipeline is paused
|
59
|
+
- `.name`: Get pipeline name
|
60
|
+
- `.run_count`: Get number of times the pipeline has run
|
61
|
+
- `.on_message`: Handler to run every time a trigger event happens
|
62
|
+
|
63
|
+
#### Examples
|
64
|
+
|
65
|
+
Time-scheduled pipeline:
|
35
66
|
```ruby
|
36
|
-
#
|
67
|
+
# Run every 10 minutes
|
37
68
|
pipeline = Pipeline.new(schedule: 'PT10M')
|
38
69
|
|
39
70
|
pipeline.on_message do |input, output|
|
40
71
|
data = input.from_uri("https://dummyjson.com/comments?limit=10")
|
41
|
-
#
|
72
|
+
# Process data
|
42
73
|
end
|
43
74
|
|
44
75
|
pipeline.run
|
45
76
|
```
|
46
77
|
|
78
|
+
Cron-scheduled pipeline:
|
47
79
|
```ruby
|
48
|
-
#
|
80
|
+
# Run every morning at 06:00 AM
|
49
81
|
pipeline = Pipeline.new(cron: '0 6 * * *')
|
50
82
|
|
51
83
|
pipeline.on_message do |input, output|
|
52
84
|
data = input.from_uri("https://dummyjson.com/comments?limit=10")
|
53
|
-
#
|
85
|
+
# Process data
|
54
86
|
end
|
55
87
|
|
56
88
|
pipeline.run
|
57
89
|
```
|
58
90
|
|
59
|
-
|
91
|
+
File-watching pipeline:
|
60
92
|
```ruby
|
61
|
-
#
|
93
|
+
# Listen for and process files in a directory
|
62
94
|
extract = DataCollector::Pipeline.new(name: 'extract', uri: 'file://./data/in')
|
63
95
|
|
64
96
|
extract.on_message do |input, output, filename|
|
65
97
|
data = input.from_uri("file://#{filename}")
|
66
|
-
#
|
98
|
+
# Process data
|
67
99
|
end
|
68
100
|
|
69
101
|
extract.run
|
70
102
|
```
|
71
103
|
|
72
|
-
|
73
|
-
The input component is part of the processing logic. All data is converted into a Hash, Array, ... accessible using plain Ruby or JSONPath using the filter object.
|
74
|
-
The input component can fetch data from various URIs, such as files, URLs, directories, queues, ...
|
75
|
-
For a push input component, a listener is created with a processing logic block that is executed whenever new data is available.
|
76
|
-
A push happens when new data is created in a directory, message queue, ...
|
77
|
-
|
78
|
-
```ruby
|
79
|
-
from_uri(source, options = {:raw, :content_type, :headers, :cookies})
|
80
|
-
```
|
81
|
-
- source: an uri with a scheme of http, https, file, amqp
|
82
|
-
- options:
|
83
|
-
- raw: _boolean_ do not parse
|
84
|
-
- content_type: _string_ force a content_type if the 'Content-Type' returned by the http server is incorrect
|
85
|
-
- headers: request headers
|
86
|
-
- cookies: session cookies etc.
|
87
|
-
- method: http verb one of [GET, POST] defaul('GET')
|
88
|
-
- body: http post body
|
89
|
-
|
90
|
-
###### example:
|
91
|
-
```ruby
|
92
|
-
# read from an http endpoint
|
93
|
-
input.from_uri("http://www.libis.be")
|
94
|
-
input.from_uri("file://hello.txt")
|
95
|
-
input.from_uri("http://www.libis.be/record.jsonld", content_type: 'application/ld+json')
|
96
|
-
input.from_uri("https://www.w3.org/TR/rdf12-turtle/examples/example1.ttl")
|
97
|
-
input.from_uri("https://dbpedia.org/sparql", body: "query=SELECT * WHERE {?sub ?pred ?obj} LIMIT 10", method:"POST", headers: {accept: "text/turtle"})
|
98
|
-
input.from_uri(StringIO.new(File.read('myrecords.xml')), content_type: 'application/xml' )
|
99
|
-
|
100
|
-
# read data from a RabbitMQ queue
|
101
|
-
listener = input.from_uri('amqp://user:password@localhost?channel=hello&queue=world')
|
102
|
-
listener.on_message do |input, output, message|
|
103
|
-
puts message
|
104
|
-
end
|
105
|
-
listener.run
|
106
|
-
|
107
|
-
# read data from a directory
|
108
|
-
listener = input.from_uri('file://this/is/directory')
|
109
|
-
listener.on_message do |input, output, filename|
|
110
|
-
puts filename
|
111
|
-
end
|
112
|
-
listener.run
|
113
|
-
```
|
104
|
+
### Input
|
114
105
|
|
115
|
-
|
106
|
+
The input component fetches data from various URIs and converts it into Ruby objects (Hash, Array, etc.).
|
116
107
|
|
117
|
-
|
118
|
-
When a listener is defined that is triggered by an event(PUSH) like a message queue or files written to a directory you have these extra methods.
|
108
|
+
#### Methods
|
119
109
|
|
120
|
-
-
|
121
|
-
-
|
122
|
-
-
|
123
|
-
-
|
124
|
-
-
|
125
|
-
-
|
126
|
-
-
|
110
|
+
- `from_uri(source, options = {})`: Fetch data from a source
|
111
|
+
- Parameters:
|
112
|
+
- `source`: URI with scheme (http, https, file, amqp)
|
113
|
+
- `options`:
|
114
|
+
- `raw`: Boolean (do not parse)
|
115
|
+
- `content_type`: String (force a specific content type)
|
116
|
+
- `headers`: Request headers
|
117
|
+
- `cookies`: Session cookies
|
118
|
+
- `method`: HTTP verb (GET, POST)
|
119
|
+
- `body`: HTTP post body
|
127
120
|
|
128
|
-
|
129
|
-
Output is an object you can store key/value pairs that needs to be written to an output stream.
|
130
|
-
```ruby
|
131
|
-
output[:name] = 'John'
|
132
|
-
output[:last_name] = 'Doe'
|
133
|
-
```
|
121
|
+
#### Examples
|
134
122
|
|
123
|
+
HTTP and file sources:
|
135
124
|
```ruby
|
136
|
-
#
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
125
|
+
# Read from an HTTP endpoint
|
126
|
+
input.from_uri("http://www.libis.be")
|
127
|
+
|
128
|
+
# Read from a file
|
129
|
+
input.from_uri("file://hello.txt")
|
130
|
+
|
131
|
+
# Force content type
|
132
|
+
input.from_uri("http://www.libis.be/record.jsonld", content_type: 'application/ld+json')
|
133
|
+
|
134
|
+
# Read RDF/Turtle data
|
135
|
+
input.from_uri("https://www.w3.org/TR/rdf12-turtle/examples/example1.ttl")
|
136
|
+
|
137
|
+
# POST request
|
138
|
+
input.from_uri(
|
139
|
+
"https://dbpedia.org/sparql",
|
140
|
+
body: "query=SELECT * WHERE {?sub ?pred ?obj} LIMIT 10",
|
141
|
+
method: "POST",
|
142
|
+
headers: {accept: "text/turtle"}
|
143
|
+
)
|
144
|
+
|
145
|
+
# Read from StringIO
|
146
|
+
input.from_uri(
|
147
|
+
StringIO.new(File.read('myrecords.xml')),
|
148
|
+
content_type: 'application/xml'
|
149
|
+
)
|
142
150
|
```
|
143
|
-
```ruby
|
144
|
-
# add hash to output
|
145
|
-
output << { age: 22 }
|
146
151
|
|
147
|
-
|
148
|
-
# # 22
|
149
|
-
```
|
152
|
+
Message queues:
|
150
153
|
```ruby
|
151
|
-
#
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
154
|
+
# Read data from a RabbitMQ queue
|
155
|
+
listener = input.from_uri('amqp://user:password@localhost?channel=hello&queue=world')
|
156
|
+
listener.on_message do |input, output, message|
|
157
|
+
puts message
|
158
|
+
end
|
159
|
+
listener.run
|
157
160
|
```
|
158
161
|
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
</names>
|
168
|
-
```
|
169
|
-
will produce
|
170
|
-
```html
|
171
|
-
<names>
|
172
|
-
<combined>John Doe</combined>
|
173
|
-
<first_name>John</first_name>
|
174
|
-
<last_name>Doe</last_name>
|
175
|
-
</names>
|
162
|
+
Directory monitoring:
|
163
|
+
```ruby
|
164
|
+
# Read data from a directory
|
165
|
+
listener = input.from_uri('file://this/is/directory')
|
166
|
+
listener.on_message do |input, output, filename|
|
167
|
+
puts filename
|
168
|
+
end
|
169
|
+
listener.run
|
176
170
|
```
|
177
171
|
|
178
|
-
|
172
|
+
CSV files with options:
|
179
173
|
```ruby
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
```
|
174
|
+
# Load a CSV with semicolon separator
|
175
|
+
data = input.from_uri('https://example.com/data.csv', col_sep: ';')
|
176
|
+
```
|
184
177
|
|
185
|
-
|
186
|
-
```ruby
|
187
|
-
output.to_uri("file://data.xml", {template: "test.erb", content_type: "application/xml"})
|
188
|
-
#template is optional
|
189
|
-
output.to_uri("file://data.json", {content_type: "application/json"})
|
190
|
-
```
|
178
|
+
#### Listener Methods
|
191
179
|
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
180
|
+
When a listener is defined (for directories or message queues):
|
181
|
+
|
182
|
+
- `.run`: Start the listener (blocking)
|
183
|
+
- `.stop`: Stop the listener
|
184
|
+
- `.pause`: Pause the listener
|
185
|
+
- `.running?`: Check if listener is running
|
186
|
+
- `.stopped?`: Check if listener is not running
|
187
|
+
- `.paused?`: Check if listener is paused
|
188
|
+
- `.on_message`: Handler to run every time a trigger event happens
|
189
|
+
|
190
|
+
### Output
|
191
|
+
|
192
|
+
Output is an object for storing key/value pairs to be written to an output stream.
|
193
|
+
|
194
|
+
#### Basic Operations
|
199
195
|
|
200
|
-
Other output methods
|
201
196
|
```ruby
|
202
|
-
|
203
|
-
output
|
204
|
-
output
|
205
|
-
|
206
|
-
|
207
|
-
output.crush
|
197
|
+
# Set values
|
198
|
+
output[:name] = 'John'
|
199
|
+
output[:last_name] = 'Doe'
|
200
|
+
|
201
|
+
# Get all keys
|
208
202
|
output.keys
|
203
|
+
|
204
|
+
# Check if key exists
|
205
|
+
output.key?(:name)
|
206
|
+
|
207
|
+
# Iterate through keys and values
|
208
|
+
output.each do |k, v|
|
209
|
+
puts "#{k}:#{v}"
|
210
|
+
end
|
211
|
+
|
212
|
+
# Add hash to output
|
213
|
+
output << { age: 22 }
|
214
|
+
puts output[:age] # 22
|
215
|
+
|
216
|
+
# Add array to output
|
217
|
+
output << [1, 2, 3, 4]
|
218
|
+
puts output['datap'] # [1, 2, 3, 4]
|
219
|
+
|
220
|
+
# Clear output
|
221
|
+
output.clear
|
209
222
|
```
|
210
223
|
|
211
|
-
|
224
|
+
#### Output Methods
|
225
|
+
|
226
|
+
- `to_s(template = nil)`: Convert output to string (optional ERB template)
|
227
|
+
- `to_uri(uri, options = {})`: Write output to a URI
|
228
|
+
- Options:
|
229
|
+
- `template`: ERB template file
|
230
|
+
- `content_type`: MIME type
|
231
|
+
- `tar`: Create a tar file (true/false)
|
232
|
+
- `tar_name`: Custom name for tar file
|
233
|
+
- `to_tmp_file(template, directory)`: Write to temporary file
|
234
|
+
- `to_xml(options = {})`: Convert to XML
|
235
|
+
- Options:
|
236
|
+
- `template`: ERB template
|
237
|
+
- `root`: Root element name (defaults to 'data')
|
238
|
+
- `to_json`: Convert to JSON
|
239
|
+
- `flatten`: Flatten nested structures
|
240
|
+
- `crush`: Compress output
|
241
|
+
- `raw`: Get raw output data
|
242
|
+
|
243
|
+
#### Examples
|
244
|
+
|
245
|
+
Using ERB templates:
|
212
246
|
```ruby
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
247
|
+
# Template (test.erb)
|
248
|
+
# <names>
|
249
|
+
# <combined><%= data[:name] %> <%= data[:last_name] %></combined>
|
250
|
+
# <%= print data, :name, :first_name %>
|
251
|
+
# <%= print data, :last_name %>
|
252
|
+
# </names>
|
253
|
+
|
254
|
+
# Generate string from template
|
255
|
+
result = output.to_s("test.erb")
|
256
|
+
|
257
|
+
# Without template
|
258
|
+
result = output.to_s
|
259
|
+
```
|
218
260
|
|
261
|
+
Writing to files:
|
219
262
|
```ruby
|
220
|
-
|
263
|
+
# Write to file with template
|
264
|
+
output.to_uri(
|
265
|
+
"file://data.xml",
|
266
|
+
{template: "test.erb", content_type: "application/xml"}
|
267
|
+
)
|
268
|
+
|
269
|
+
# Write to file without template
|
270
|
+
output.to_uri("file://data.json", {content_type: "application/json"})
|
221
271
|
```
|
222
272
|
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
273
|
+
Creating tar archives:
|
274
|
+
```ruby
|
275
|
+
# Create tar with random name
|
276
|
+
data = output.to_uri(
|
277
|
+
"file://data.json",
|
278
|
+
{content_type: "application/json", tar: true}
|
279
|
+
)
|
280
|
+
|
281
|
+
# Create tar with specific name
|
282
|
+
data = output.to_uri(
|
283
|
+
"file://./test.json",
|
284
|
+
{
|
285
|
+
template: "test.erb",
|
286
|
+
content_type: 'application/json',
|
287
|
+
tar_name: "test.tar.gz"
|
288
|
+
}
|
289
|
+
)
|
290
|
+
```
|
228
291
|
|
229
|
-
|
292
|
+
### Filter
|
230
293
|
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
SUFFIX
|
294
|
+
Filter data from a hash using JSONPath.
|
295
|
+
|
296
|
+
```ruby
|
297
|
+
# Extract data using JSONPath
|
298
|
+
filtered_data = filter(data, "$..metadata.record")
|
237
299
|
```
|
238
300
|
|
239
|
-
|
301
|
+
### Rules
|
240
302
|
|
241
|
-
|
303
|
+
Rules provide a systematic way to transform data using three components:
|
304
|
+
- A destination tag
|
305
|
+
- A JSONPath filter to get the data
|
306
|
+
- A lambda function to execute on every filter hit
|
242
307
|
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
}
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
},
|
284
|
-
'
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
308
|
+
#### Example Rule Sets
|
309
|
+
|
310
|
+
```ruby
|
311
|
+
RULE_SETS = {
|
312
|
+
# Simple filter
|
313
|
+
'rs_only_filter' => {
|
314
|
+
'only_filter' => "$.title"
|
315
|
+
},
|
316
|
+
|
317
|
+
# Plain text
|
318
|
+
'rs_only_text' => {
|
319
|
+
'plain_text_tag' => {
|
320
|
+
'text' => 'hello world'
|
321
|
+
}
|
322
|
+
},
|
323
|
+
|
324
|
+
# Text with suffix
|
325
|
+
'rs_text_with_suffix' => {
|
326
|
+
'text_tag_with_suffix' => {
|
327
|
+
'text' => ['hello_world', {'suffix' => '-suffix'}]
|
328
|
+
}
|
329
|
+
},
|
330
|
+
|
331
|
+
# Map values
|
332
|
+
'rs_map_with_json_filter' => {
|
333
|
+
'language' => {
|
334
|
+
'@' => {'nl' => 'dut', 'fr' => 'fre', 'de' => 'ger', 'en' => 'eng'}
|
335
|
+
}
|
336
|
+
},
|
337
|
+
|
338
|
+
# Transform with lambda
|
339
|
+
'rs_hash_with_json_filter' => {
|
340
|
+
'multiple_of_2' => {
|
341
|
+
'@' => lambda { |d| d.to_i * 2 }
|
342
|
+
}
|
343
|
+
},
|
344
|
+
|
345
|
+
# Multiple transforms
|
346
|
+
'rs_hash_with_multiple_json_filter' => {
|
347
|
+
'multiple_of' => [
|
348
|
+
{'@' => lambda { |d| d.to_i * 2 }},
|
349
|
+
{'@' => lambda { |d| d.to_i * 3 }}
|
350
|
+
]
|
351
|
+
},
|
352
|
+
|
353
|
+
# Transform with suffix
|
354
|
+
'rs_hash_with_json_filter_and_suffix' => {
|
355
|
+
'multiple_of_with_suffix' => {
|
356
|
+
'@' => [lambda {|d| d.to_i*2}, 'suffix' => '-multiple_of_2']
|
357
|
+
}
|
358
|
+
},
|
359
|
+
|
360
|
+
# Multiple lambdas
|
361
|
+
'rs_hash_with_json_filter_and_multiple_lambdas' => {
|
362
|
+
'multiple_lambdas' => {
|
363
|
+
'@' => [lambda {|d| d.to_i*2}, lambda {|d| Math.sqrt(d.to_i) }]
|
364
|
+
}
|
365
|
+
},
|
366
|
+
|
367
|
+
# With options
|
368
|
+
'rs_hash_with_json_filter_and_option' => {
|
369
|
+
'subjects' => {
|
370
|
+
'$..subject' => [
|
371
|
+
lambda {|d,o|
|
372
|
+
{
|
373
|
+
doc_id: o['id'],
|
374
|
+
subject: d
|
294
375
|
}
|
295
|
-
|
376
|
+
}
|
377
|
+
]
|
378
|
+
}
|
379
|
+
}
|
380
|
+
}
|
296
381
|
```
|
297
382
|
|
298
|
-
|
299
|
-
***rules.run*** can have 4 parameters. First 3 are mandatory. The last one ***options*** can hold data static to a rule set or engine directives.
|
383
|
+
#### Using Rules
|
300
384
|
|
301
|
-
##### List of engine directives:
|
302
|
-
- _no_array_with_one_element: defaults to false. if the result is an array with 1 element just return the element.
|
303
|
-
|
304
|
-
###### example:
|
305
385
|
```ruby
|
306
|
-
#
|
307
|
-
|
308
|
-
|
309
|
-
|
386
|
+
# Apply rule set with options
|
387
|
+
data = {'subject' => ['water', 'thermodynamics']}
|
388
|
+
rules.run(RULE_SETS['rs_hash_with_json_filter_and_option'], data, output, {'id' => 1})
|
389
|
+
|
390
|
+
# Result:
|
391
|
+
# {
|
392
|
+
# "subjects":[
|
393
|
+
# {"doc_id":1,"subject":"water"},
|
394
|
+
# {"doc_id":1,"subject":"thermodynamics"}
|
395
|
+
# ]
|
396
|
+
# }
|
397
|
+
```
|
310
398
|
|
311
|
-
|
399
|
+
Engine directives:
|
400
|
+
- `_no_array_with_one_element`: If true and the result is a single-element array (even when that element is itself an array or hash), return just the element (default: false)
|
401
|
+
- `_no_array_with_one_literal`: If true and the result is a single-element array whose element is a literal (not an array or hash), return just the element (default: false)
|
312
402
|
|
313
|
-
|
403
|
+
### Config
|
314
404
|
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
}
|
405
|
+
The config object points to a configuration file (default: "config.yml").
|
406
|
+
|
407
|
+
__Example__ config.yml
|
408
|
+
```yaml
|
409
|
+
cache: "/tmp"
|
410
|
+
password: ${SECRET}
|
411
|
+
active: true
|
323
412
|
```
|
324
413
|
|
414
|
+
__Usage__
|
415
|
+
```ruby
|
416
|
+
# Set config path and filename
|
417
|
+
config.path = "/path/to/my/config"
|
418
|
+
config.name = "not_my_config.yml"
|
325
419
|
|
420
|
+
# Check config
|
421
|
+
puts config.version
|
422
|
+
puts config.include?(:key)
|
423
|
+
puts config.keys
|
326
424
|
|
327
|
-
|
328
|
-
config
|
425
|
+
# Read config value
|
426
|
+
config[:active]
|
329
427
|
|
330
|
-
|
331
|
-
|
332
|
-
config[:active]
|
333
|
-
```
|
334
|
-
___write___
|
335
|
-
```ruby
|
336
|
-
config[:active] = false
|
337
|
-
```
|
338
|
-
#### log
|
339
|
-
Log to stdout
|
340
|
-
```ruby
|
341
|
-
log("hello world")
|
342
|
-
```
|
343
|
-
#### error
|
344
|
-
Log an error to stdout
|
345
|
-
```ruby
|
346
|
-
error("if you have an issue take a tissue")
|
428
|
+
# Write config value
|
429
|
+
config[:active] = false
|
347
430
|
```
|
348
|
-
|
349
|
-
|
431
|
+
|
432
|
+
### Logging
|
433
|
+
|
350
434
|
```ruby
|
351
|
-
|
352
|
-
|
353
|
-
# add multiple log outputs
|
354
|
-
logger(STDOUT, f)
|
435
|
+
# Log to stdout
|
436
|
+
log("hello world")
|
355
437
|
|
356
|
-
#
|
357
|
-
|
438
|
+
# Log error
|
439
|
+
error("if you have an issue take a tissue")
|
440
|
+
|
441
|
+
# Configure logger outputs
|
442
|
+
f = File.open('/tmp/data.log', 'w')
|
443
|
+
f.sync = true # Do not buffer
|
444
|
+
logger(STDOUT, f) # Log to both STDOUT and file
|
358
445
|
```
|
359
446
|
|
360
|
-
## Example
|
361
|
-
|
447
|
+
## Complete Example
|
448
|
+
|
449
|
+
Input data (test.csv):
|
362
450
|
```csv
|
363
451
|
sequence, data
|
364
452
|
1, apple
|
@@ -366,107 +454,86 @@ sequence, data
|
|
366
454
|
3, peach
|
367
455
|
```
|
368
456
|
|
369
|
-
Output template
|
370
|
-
```
|
371
|
-
|
372
|
-
|
373
|
-
|
374
|
-
|
375
|
-
|
376
|
-
|
457
|
+
Output template (test.erb):
|
458
|
+
```erb
|
459
|
+
<data>
|
460
|
+
<% data[:record].each do |d| %>
|
461
|
+
<record sequence="<%= d[:sequence] %>">
|
462
|
+
<%= print d, :data %>
|
463
|
+
</record>
|
464
|
+
<% end %>
|
377
465
|
</data>
|
378
466
|
```
|
379
467
|
|
468
|
+
Processing script:
|
380
469
|
```ruby
|
381
470
|
require 'data_collector'
|
382
471
|
include DataCollector::Core
|
383
472
|
|
473
|
+
# Read CSV data
|
384
474
|
data = input.from_uri('file://test.csv')
|
475
|
+
|
476
|
+
# Transform data
|
385
477
|
data.map{ |m| m[:sequence] *=2; m }
|
386
478
|
|
387
|
-
output
|
479
|
+
# Store in output
|
480
|
+
output[:record] = data
|
388
481
|
|
482
|
+
# Generate result using template
|
389
483
|
puts output.to_s('test.erb')
|
390
484
|
```
|
391
485
|
|
392
|
-
|
486
|
+
Output:
|
393
487
|
```xml
|
394
488
|
<data>
|
395
|
-
|
396
|
-
|
397
|
-
|
398
|
-
|
399
|
-
|
400
|
-
|
401
|
-
|
402
|
-
|
403
|
-
|
489
|
+
<record sequence="11">
|
490
|
+
<data> apple</data>
|
491
|
+
</record>
|
492
|
+
<record sequence="22">
|
493
|
+
<data> banana</data>
|
494
|
+
</record>
|
495
|
+
<record sequence="33">
|
496
|
+
<data> peach</data>
|
497
|
+
</record>
|
404
498
|
</data>
|
405
499
|
```
|
406
500
|
|
407
|
-
|
408
|
-
are the same the Ruby [CSV](https://docs.ruby-lang.org/en/master/CSV.html#class-CSV-label-Options) class
|
409
|
-
|
410
|
-
Loading a CSV file with **;** as the row seperator
|
411
|
-
```ruby
|
412
|
-
i = input.from_uri('https://support.staffbase.com/hc/en-us/article_attachments/360009197031/username.csv', col_sep: ';')
|
413
|
-
```
|
414
|
-
|
415
|
-
## Installation
|
416
|
-
|
417
|
-
Add this line to your application's Gemfile:
|
418
|
-
|
419
|
-
```ruby
|
420
|
-
gem 'data_collector'
|
421
|
-
```
|
422
|
-
|
423
|
-
And then execute:
|
424
|
-
|
425
|
-
$ bundle
|
426
|
-
|
427
|
-
Or install it yourself as:
|
428
|
-
|
429
|
-
$ gem install data_collector
|
430
|
-
|
431
|
-
## Usage
|
501
|
+
## Full Pipeline Example
|
432
502
|
|
433
503
|
```ruby
|
434
504
|
require 'data_collector'
|
435
505
|
|
436
506
|
include DataCollector::Core
|
437
|
-
|
507
|
+
|
508
|
+
# Define rules
|
438
509
|
RULES = {
|
439
|
-
|
510
|
+
'title' => '$..vertitle'
|
440
511
|
}
|
441
|
-
|
512
|
+
|
513
|
+
# Create a PULL pipeline and schedule it to run every 5 seconds
|
442
514
|
pipeline = DataCollector::Pipeline.new(schedule: 'PT5S')
|
443
515
|
|
444
516
|
pipeline.on_message do |input, output|
|
517
|
+
# Fetch data
|
445
518
|
data = input.from_uri('https://services3.libis.be/primo_artefact/lirias3611609')
|
519
|
+
|
520
|
+
# Apply rules
|
446
521
|
rules.run(RULES, data, output)
|
447
|
-
|
522
|
+
|
523
|
+
# Output results
|
448
524
|
puts JSON.pretty_generate(output.raw)
|
449
525
|
output.clear
|
450
526
|
|
527
|
+
# Stop after 3 runs
|
451
528
|
if pipeline.run_count > 2
|
452
529
|
log('stopping pipeline after one run')
|
453
530
|
pipeline.stop
|
454
531
|
end
|
455
532
|
end
|
456
|
-
pipeline.run
|
457
533
|
|
534
|
+
pipeline.run
|
458
535
|
```
|
459
536
|
|
460
|
-
## Development
|
461
|
-
|
462
|
-
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
463
|
-
|
464
|
-
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
465
|
-
|
466
|
-
## Contributing
|
467
|
-
|
468
|
-
Bug reports and pull requests are welcome on GitHub at https://github.com/[USERNAME]/data_collector.
|
469
|
-
|
470
537
|
## License
|
471
538
|
|
472
539
|
The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
|
@@ -24,13 +24,13 @@ module DataCollector
|
|
24
24
|
when Array
|
25
25
|
odata = {}
|
26
26
|
rule.each do |sub_rule|
|
27
|
-
d=apply_rule(tag, sub_rule, input_data, output_data, options)
|
27
|
+
d = apply_rule(tag, sub_rule, input_data, output_data, options)
|
28
28
|
next if d.nil?
|
29
|
-
odata.merge!(d) {|k,v, n|
|
30
|
-
[v,n].flatten
|
29
|
+
odata.merge!(d) { |k, v, n|
|
30
|
+
[v, n].flatten
|
31
31
|
}
|
32
32
|
end
|
33
|
-
odata.each do |k,v|
|
33
|
+
odata.each do |k, v|
|
34
34
|
output_data.data[k] = v
|
35
35
|
end
|
36
36
|
return output_data
|
@@ -120,13 +120,20 @@ module DataCollector
|
|
120
120
|
|
121
121
|
output_data.compact! if output_data.is_a?(Array)
|
122
122
|
output_data.flatten! if output_data.is_a?(Array)
|
123
|
-
if
|
123
|
+
if options.with_indifferent_access.key?('_no_array_with_one_literal') &&
|
124
|
+
options.with_indifferent_access['_no_array_with_one_literal'] &&
|
125
|
+
output_data.is_a?(Array) &&
|
124
126
|
output_data.size == 1 &&
|
125
|
-
(output_data.first.is_a?(Array) || output_data.first.is_a?(Hash))
|
127
|
+
not((output_data.first.is_a?(Array) || output_data.first.is_a?(Hash)))
|
126
128
|
output_data = output_data.first
|
129
|
+
# elsif output_data.is_a?(Array) &&
|
130
|
+
# output_data.size == 1 &&
|
131
|
+
# (output_data.first.is_a?(Array) || output_data.first.is_a?(Hash))
|
132
|
+
# output_data = output_data.first
|
127
133
|
end
|
128
134
|
|
129
|
-
if options.with_indifferent_access.key?('_no_array_with_one_element') &&
|
135
|
+
if options.with_indifferent_access.key?('_no_array_with_one_element') &&
|
136
|
+
options.with_indifferent_access['_no_array_with_one_element'] &&
|
130
137
|
output_data.is_a?(Array) && output_data.size == 1
|
131
138
|
output_data = output_data.first
|
132
139
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: data_collector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.63.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Mehmet Celik
|
@@ -407,7 +407,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
407
407
|
- !ruby/object:Gem::Version
|
408
408
|
version: '0'
|
409
409
|
requirements: []
|
410
|
-
rubygems_version: 3.6.
|
410
|
+
rubygems_version: 3.6.9
|
411
411
|
specification_version: 4
|
412
412
|
summary: ETL helper library
|
413
413
|
test_files: []
|