data_collector 0.20.0 → 0.21.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +58 -14
- data/data_collector.gemspec +3 -1
- data/lib/data_collector/core.rb +5 -0
- data/lib/data_collector/input/dir.rb +2 -2
- data/lib/data_collector/input/generic.rb +7 -5
- data/lib/data_collector/input/queue.rb +18 -8
- data/lib/data_collector/input/rpc.rb +73 -0
- data/lib/data_collector/input.rb +19 -4
- data/lib/data_collector/output/generic.rb +38 -0
- data/lib/data_collector/output/rpc.rb +43 -0
- data/lib/data_collector/output.rb +158 -78
- data/lib/data_collector/pipeline.rb +1 -1
- data/lib/data_collector/version.rb +1 -1
- metadata +35 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 919f205875cd4bdd02d951b50fba36833743c96eb887d8b183bdb4fabcff62d2
|
4
|
+
data.tar.gz: 6feefeb606971a467dbf1cd47d39a3e76b5e23563fbd81c7be37f07f9c2f4bc2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d51898627739f4047234e617ed10e8686cbcdc76782ea9cdf78ed5c566ed18801de0938d0abe6d57d734297a066904043a29027d33b3de6ef35fbe255cf19c2b
|
7
|
+
data.tar.gz: cd05871afe0d9d9bcb698abb96292d7ed4780d41c3ddeb74db649abb7512f4c06f3991c11211cb2d574d7636de68a4fa1b23fa817943f894666fa64ff4aa31f8
|
data/README.md
CHANGED
@@ -1,15 +1,13 @@
|
|
1
1
|
# DataCollector
|
2
|
-
Convenience module to Extract, Transform and Load
|
2
|
+
Convenience module to Extract, Transform and Load data in a Pipeline.
|
3
3
|
The 'INPUT', 'OUTPUT' and 'FILTER' objects will help you to read, transform and output your data.
|
4
|
-
Support objects like CONFIG, LOG, ERROR, RULES
|
4
|
+
Support objects like CONFIG, LOG, ERROR and RULES help you write manageable rules to transform and log your data.
|
5
5
|
Including the DataCollector::Core module in your application gives you access to these objects.
|
6
6
|
```ruby
|
7
7
|
include DataCollector::Core
|
8
8
|
```
|
9
|
-
|
10
9
|
Every object can be used on its own.
|
11
10
|
|
12
|
-
|
13
11
|
#### Pipeline
|
14
12
|
Allows you to create a simple pipeline of operations to process data. With a data pipeline, you can collect, process, and transform data, and then transfer it to various systems and applications.
|
15
13
|
|
@@ -28,16 +26,29 @@ executed in the [ISO8601 duration format](https://www.digi.com/resources/documen
|
|
28
26
|
- .on_message: handle to run every time a trigger event happens
|
29
27
|
###### example:
|
30
28
|
```ruby
|
31
|
-
#create a
|
29
|
+
#create a pipeline scheduled to run every 10 minutes
|
32
30
|
pipeline = Pipeline.new(schedule: 'PT10M')
|
33
31
|
|
34
32
|
pipeline.on_message do |input, output|
|
35
|
-
|
33
|
+
data = input.from_uri("https://dummyjson.com/comments?limit=10")
|
34
|
+
# process data
|
36
35
|
end
|
37
36
|
|
38
37
|
pipeline.run
|
39
38
|
```
|
40
39
|
|
40
|
+
```ruby
|
41
|
+
#create a pipeline to listen and process files in a directory
|
42
|
+
extract = DataCollector::Pipeline.new(name: 'extract', uri: 'file://./data/in')
|
43
|
+
|
44
|
+
extract.on_message do |input, output, filename|
|
45
|
+
data = input.from_uri("file://#{filename}")
|
46
|
+
# process data
|
47
|
+
end
|
48
|
+
|
49
|
+
extract.run
|
50
|
+
```
|
51
|
+
|
41
52
|
#### input
|
42
53
|
The input component is part of the processing logic. All data is converted into a Hash, Array, ... accessible using plain Ruby or JSONPath using the filter object.
|
43
54
|
The input component can fetch data from various URIs, such as files, URLs, directories, queues, ...
|
@@ -60,18 +71,18 @@ A push happens when new data is created in a directory, message queue, ...
|
|
60
71
|
input.from_uri("http://www.libis.be/record.jsonld", content_type: 'application/ld+json')
|
61
72
|
|
62
73
|
# read data from a RabbitMQ queue
|
63
|
-
listener = input.from_uri('amqp://user:password@localhost?channel=hello')
|
74
|
+
listener = input.from_uri('amqp://user:password@localhost?channel=hello&queue=world')
|
64
75
|
listener.on_message do |input, output, message|
|
65
76
|
puts message
|
66
77
|
end
|
67
|
-
listener.
|
78
|
+
listener.run
|
68
79
|
|
69
80
|
# read data from a directory
|
70
81
|
listener = input.from_uri('file://this/is/directory')
|
71
82
|
listener.on_message do |input, output, filename|
|
72
83
|
puts filename
|
73
84
|
end
|
74
|
-
listener.
|
85
|
+
listener.run
|
75
86
|
```
|
76
87
|
|
77
88
|
Inputs can be JSON, XML, CSV, or XML inside a TAR.GZ file
|
@@ -94,6 +105,30 @@ Output is an object you can store key/value pairs that needs to be written to an
|
|
94
105
|
output[:last_name] = 'Doe'
|
95
106
|
```
|
96
107
|
|
108
|
+
```ruby
|
109
|
+
# get all keys from the output object
|
110
|
+
output.keys
|
111
|
+
output.key?(:name)
|
112
|
+
output.each do |k,v|
|
113
|
+
puts "#{k}:#{v}"
|
114
|
+
end
|
115
|
+
```
|
116
|
+
```ruby
|
117
|
+
# add hash to output
|
118
|
+
output << { age: 22 }
|
119
|
+
|
120
|
+
puts output[:age]
|
121
|
+
# # 22
|
122
|
+
```
|
123
|
+
```ruby
|
124
|
+
# add array to output
|
125
|
+
output << [1,2,3,4]
|
126
|
+
puts output.keys
|
127
|
+
# # datap
|
128
|
+
puts output['datap']
|
129
|
+
# # [1, 2, 3, 4]
|
130
|
+
```
|
131
|
+
|
97
132
|
Write output to a file or a string, optionally using an ERB file as a template
|
98
133
|
example:
|
99
134
|
___test.erb___
|
@@ -116,25 +151,34 @@ will produce
|
|
116
151
|
Into a variable
|
117
152
|
```ruby
|
118
153
|
result = output.to_s("test.erb")
|
154
|
+
#template is optional
|
155
|
+
result = output.to_s
|
119
156
|
```
|
120
157
|
|
121
|
-
Into a file
|
158
|
+
Into a file
|
122
159
|
```ruby
|
123
|
-
output.
|
160
|
+
output.to_uri("file://data.xml", {template: "test.erb", content_type: "application/xml"})
|
161
|
+
#template is optional
|
162
|
+
output.to_uri("file://data.json", {content_type: "application/json"})
|
124
163
|
```
|
125
164
|
|
126
165
|
Into a tar file stored in data
|
127
166
|
```ruby
|
128
|
-
|
167
|
+
# create a tar file with a random name
|
168
|
+
data = output.to_uri("file://data.json", {content_type: "application/json", tar:true})
|
169
|
+
#choose the tar file name yourself
|
170
|
+
data = output.to_uri("file://./test.json", {template: "test.erb", content_type: 'application/json', tar_name: "test.tar.gz"})
|
129
171
|
```
|
130
172
|
|
131
173
|
Other output methods
|
132
174
|
```ruby
|
133
175
|
output.raw
|
134
176
|
output.clear
|
135
|
-
output.
|
136
|
-
output.
|
177
|
+
output.to_xml(template: 'test.erb', root: 'record') # root defaults to 'data'
|
178
|
+
output.to_json
|
137
179
|
output.flatten
|
180
|
+
output.crush
|
181
|
+
output.keys
|
138
182
|
```
|
139
183
|
|
140
184
|
Into a temp directory
|
data/data_collector.gemspec
CHANGED
@@ -43,11 +43,13 @@ Gem::Specification.new do |spec|
|
|
43
43
|
spec.add_runtime_dependency 'jsonpath', '~> 1.1'
|
44
44
|
spec.add_runtime_dependency 'mime-types', '~> 3.4'
|
45
45
|
spec.add_runtime_dependency 'minitar', '= 0.9'
|
46
|
-
spec.add_runtime_dependency 'nokogiri', '~> 1.
|
46
|
+
spec.add_runtime_dependency 'nokogiri', '~> 1.15'
|
47
47
|
spec.add_runtime_dependency 'nori', '~> 2.6'
|
48
48
|
spec.add_runtime_dependency 'iso8601', '~> 0.13'
|
49
49
|
spec.add_runtime_dependency 'listen', '~> 3.8'
|
50
50
|
spec.add_runtime_dependency 'bunny', '~> 2.20'
|
51
|
+
spec.add_runtime_dependency 'bunny_burrow', '~> 1.5'
|
52
|
+
spec.add_runtime_dependency 'builder', '~> 3.2'
|
51
53
|
|
52
54
|
spec.add_development_dependency 'bundler', '~> 2.3'
|
53
55
|
spec.add_development_dependency 'minitest', '~> 5.18'
|
data/lib/data_collector/core.rb
CHANGED
@@ -4,7 +4,7 @@ require 'listen'
|
|
4
4
|
module DataCollector
|
5
5
|
class Input
|
6
6
|
class Dir < Generic
|
7
|
-
def initialize(uri, options)
|
7
|
+
def initialize(uri, options = {})
|
8
8
|
super
|
9
9
|
end
|
10
10
|
|
@@ -18,7 +18,7 @@ module DataCollector
|
|
18
18
|
@listener ||= Listen.to("#{@uri.host}/#{@uri.path}", @options) do |modified, added, _|
|
19
19
|
files = added | modified
|
20
20
|
files.each do |filename|
|
21
|
-
handle_on_message(input, output, filename)
|
21
|
+
handle_on_message(@input, @output, filename)
|
22
22
|
end
|
23
23
|
end
|
24
24
|
end
|
@@ -3,19 +3,21 @@ require 'listen'
|
|
3
3
|
module DataCollector
|
4
4
|
class Input
|
5
5
|
class Generic
|
6
|
-
def initialize(uri, options)
|
6
|
+
def initialize(uri, options = {})
|
7
7
|
@uri = uri
|
8
8
|
@options = options
|
9
|
+
@running = false
|
9
10
|
|
10
11
|
@input = DataCollector::Input.new
|
11
12
|
@output = DataCollector::Output.new
|
12
13
|
|
13
|
-
@
|
14
|
+
@name = options[:name] || "input-#{Time.now.to_i}-#{rand(10000)}"
|
15
|
+
create_listener
|
14
16
|
end
|
15
17
|
|
16
18
|
def run(should_block = false, &block)
|
17
19
|
raise DataCollector::Error, 'Please supply a on_message block' if @on_message_callback.nil?
|
18
|
-
@
|
20
|
+
@running = true
|
19
21
|
|
20
22
|
if should_block
|
21
23
|
while running?
|
@@ -37,11 +39,11 @@ module DataCollector
|
|
37
39
|
end
|
38
40
|
|
39
41
|
def running?
|
40
|
-
@
|
42
|
+
@running
|
41
43
|
end
|
42
44
|
|
43
45
|
def stopped?
|
44
|
-
@
|
46
|
+
@running == false
|
45
47
|
end
|
46
48
|
|
47
49
|
def paused?
|
@@ -1,15 +1,15 @@
|
|
1
1
|
require_relative 'generic'
|
2
2
|
require 'bunny'
|
3
3
|
require 'active_support/core_ext/hash'
|
4
|
+
require 'ostruct'
|
4
5
|
|
5
6
|
module DataCollector
|
6
7
|
class Input
|
7
8
|
class Queue < Generic
|
8
|
-
def initialize(uri, options)
|
9
|
+
def initialize(uri, options = {})
|
9
10
|
super
|
10
11
|
|
11
12
|
if running?
|
12
|
-
create_channel unless @channel
|
13
13
|
create_queue unless @queue
|
14
14
|
end
|
15
15
|
end
|
@@ -18,9 +18,9 @@ module DataCollector
|
|
18
18
|
@listener.open?
|
19
19
|
end
|
20
20
|
|
21
|
-
def send(message)
|
21
|
+
def send(route, message)
|
22
22
|
if running?
|
23
|
-
@
|
23
|
+
@exchange.publish(message, routing_key: route)
|
24
24
|
end
|
25
25
|
end
|
26
26
|
|
@@ -37,6 +37,15 @@ module DataCollector
|
|
37
37
|
end
|
38
38
|
end
|
39
39
|
|
40
|
+
def create_exchange
|
41
|
+
@exchange ||= begin
|
42
|
+
options = CGI.parse(@uri.query).with_indifferent_access
|
43
|
+
raise DataCollector::Error, '"channel" query parameter missing from uri.' unless options.include?(:channel)
|
44
|
+
create_channel
|
45
|
+
@channel.topic(options[:channel].first, auto_delete: true)
|
46
|
+
end
|
47
|
+
end
|
48
|
+
|
40
49
|
def create_channel
|
41
50
|
raise DataCollector::Error, 'Connection to RabbitMQ is closed' if @listener.closed?
|
42
51
|
@channel ||= @listener.create_channel
|
@@ -45,11 +54,12 @@ module DataCollector
|
|
45
54
|
def create_queue
|
46
55
|
@queue ||= begin
|
47
56
|
options = CGI.parse(@uri.query).with_indifferent_access
|
48
|
-
raise DataCollector::Error, '"
|
49
|
-
|
57
|
+
raise DataCollector::Error, '"queue" query parameter missing from uri.' unless options.include?(:queue)
|
58
|
+
create_exchange
|
59
|
+
queue = @channel.queue(options[:queue].first, auto_delete: true).bind(@exchange, routing_key: "#{options[:queue].first}.#")
|
50
60
|
|
51
|
-
queue.subscribe do |delivery_info, metadata, payload|
|
52
|
-
handle_on_message(input, output, payload)
|
61
|
+
queue.subscribe(consumer_tag: @name) do |delivery_info, metadata, payload|
|
62
|
+
handle_on_message(@input, @output, OpenStruct.new(info: delivery_info, properties: metadata, body: payload))
|
53
63
|
end if queue
|
54
64
|
|
55
65
|
queue
|
@@ -0,0 +1,73 @@
|
|
1
|
+
require_relative 'generic'
|
2
|
+
require 'bunny_burrow'
|
3
|
+
require 'active_support/core_ext/hash'
|
4
|
+
require 'ostruct'
|
5
|
+
require 'securerandom'
|
6
|
+
require 'thread'
|
7
|
+
|
8
|
+
module DataCollector
|
9
|
+
class Input
|
10
|
+
class Rpc < Generic
|
11
|
+
def initialize(uri, options = {})
|
12
|
+
super
|
13
|
+
end
|
14
|
+
|
15
|
+
def running?
|
16
|
+
@running
|
17
|
+
end
|
18
|
+
|
19
|
+
def stop
|
20
|
+
if running?
|
21
|
+
@listener.shutdown
|
22
|
+
@running = false
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
def pause
|
27
|
+
raise "PAUSE not implemented."
|
28
|
+
end
|
29
|
+
|
30
|
+
|
31
|
+
|
32
|
+
def run(should_block = false, &block)
|
33
|
+
@listener.subscribe(@bunny_queue) do |payload|
|
34
|
+
payload = JSON.parse(payload)
|
35
|
+
response = BunnyBurrow::Server.create_response
|
36
|
+
response[:data] = handle_on_message(@input, @output, payload)
|
37
|
+
|
38
|
+
response
|
39
|
+
end
|
40
|
+
@running = true
|
41
|
+
|
42
|
+
if should_block
|
43
|
+
while running?
|
44
|
+
yield block if block_given?
|
45
|
+
@listener.wait
|
46
|
+
end
|
47
|
+
else
|
48
|
+
yield block if block_given?
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
private
|
53
|
+
def create_listener
|
54
|
+
@listener ||= BunnyBurrow::Server.new do |server|
|
55
|
+
parse_uri
|
56
|
+
server.rabbitmq_url = @bunny_uri.to_s
|
57
|
+
server.rabbitmq_exchange = @bunny_channel
|
58
|
+
server.logger = DataCollector::Core.logger
|
59
|
+
end
|
60
|
+
end
|
61
|
+
|
62
|
+
def parse_uri
|
63
|
+
raise 'URI must be of format rpc+amqp://user:password@host/exchange/queue' unless @uri.path =~ /\// && @uri.path.split('/').length == 3
|
64
|
+
|
65
|
+
@bunny_channel = @uri.path.split('/')[1]
|
66
|
+
@bunny_queue = @uri.path.split('/')[2]
|
67
|
+
@bunny_uri = @uri.clone
|
68
|
+
@bunny_uri.path=''
|
69
|
+
@bunny_uri.scheme='amqp'
|
70
|
+
end
|
71
|
+
end
|
72
|
+
end
|
73
|
+
end
|
data/lib/data_collector/input.rb
CHANGED
@@ -14,6 +14,7 @@ require 'minitar'
|
|
14
14
|
require 'csv'
|
15
15
|
require_relative 'input/dir'
|
16
16
|
require_relative 'input/queue'
|
17
|
+
require_relative 'input/rpc'
|
17
18
|
|
18
19
|
#require_relative 'ext/xml_utility_node'
|
19
20
|
module DataCollector
|
@@ -26,7 +27,7 @@ module DataCollector
|
|
26
27
|
|
27
28
|
def from_uri(source, options = {})
|
28
29
|
source = CGI.unescapeHTML(source)
|
29
|
-
@logger.info("
|
30
|
+
@logger.info("Reading #{source}")
|
30
31
|
uri = URI(source)
|
31
32
|
begin
|
32
33
|
data = nil
|
@@ -43,8 +44,12 @@ module DataCollector
|
|
43
44
|
raise DataCollector::Error, "#{uri.host}/#{uri.path} not found" unless File.exist?("#{uri.host}/#{uri.path}")
|
44
45
|
data = from_file(uri, options)
|
45
46
|
end
|
46
|
-
when
|
47
|
-
|
47
|
+
when /amqp/
|
48
|
+
if uri.scheme =~ /^rpc/
|
49
|
+
data = from_rpc(uri, options)
|
50
|
+
else
|
51
|
+
data = from_queue(uri, options)
|
52
|
+
end
|
48
53
|
else
|
49
54
|
raise "Do not know how to process #{source}"
|
50
55
|
end
|
@@ -103,7 +108,7 @@ module DataCollector
|
|
103
108
|
end
|
104
109
|
|
105
110
|
case http_response.code
|
106
|
-
when 200
|
111
|
+
when 200..299
|
107
112
|
@raw = data = http_response.body.to_s
|
108
113
|
|
109
114
|
# File.open("#{rand(1000)}.xml", 'wb') do |f|
|
@@ -130,14 +135,20 @@ module DataCollector
|
|
130
135
|
data = xml_to_hash(data)
|
131
136
|
end
|
132
137
|
end
|
138
|
+
|
139
|
+
raise '206 Partial Content' if http_response.code ==206
|
140
|
+
|
133
141
|
when 401
|
134
142
|
raise 'Unauthorized'
|
143
|
+
when 403
|
144
|
+
raise 'Forbidden'
|
135
145
|
when 404
|
136
146
|
raise 'Not found'
|
137
147
|
else
|
138
148
|
raise "Unable to process received status code = #{http_response.code}"
|
139
149
|
end
|
140
150
|
|
151
|
+
#[data, http_response.code]
|
141
152
|
data
|
142
153
|
end
|
143
154
|
|
@@ -178,6 +189,10 @@ module DataCollector
|
|
178
189
|
DataCollector::Input::Queue.new(uri, options)
|
179
190
|
end
|
180
191
|
|
192
|
+
def from_rpc(uri, options = {})
|
193
|
+
DataCollector::Input::Rpc.new(uri, options)
|
194
|
+
end
|
195
|
+
|
181
196
|
def xml_to_hash(data)
|
182
197
|
#gsub('<\/', '< /') outherwise wrong XML-parsing (see records lirias1729192 )
|
183
198
|
data = data.gsub /</, '< /'
|
@@ -0,0 +1,38 @@
|
|
1
|
+
require 'bunny_burrow'
|
2
|
+
require 'json'
|
3
|
+
|
4
|
+
module DataCollector
|
5
|
+
class Output
|
6
|
+
class Generic
|
7
|
+
def initialize(uri, options = {})
|
8
|
+
@uri = uri
|
9
|
+
@options = options
|
10
|
+
@running = false
|
11
|
+
|
12
|
+
create_producer
|
13
|
+
end
|
14
|
+
|
15
|
+
def send(message)
|
16
|
+
raise DataCollector::Error, 'Please implement a producer'
|
17
|
+
end
|
18
|
+
|
19
|
+
def running?
|
20
|
+
@running
|
21
|
+
end
|
22
|
+
|
23
|
+
def stopped?
|
24
|
+
@running == false
|
25
|
+
end
|
26
|
+
|
27
|
+
def stop
|
28
|
+
@running = false
|
29
|
+
end
|
30
|
+
|
31
|
+
private
|
32
|
+
def create_producer
|
33
|
+
raise DataCollector::Error, 'Please implement a producer'
|
34
|
+
end
|
35
|
+
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
require_relative './generic'
|
2
|
+
|
3
|
+
module DataCollector
|
4
|
+
class Output
|
5
|
+
class Rpc < Generic
|
6
|
+
def initialize(uri, options = {})
|
7
|
+
super
|
8
|
+
end
|
9
|
+
|
10
|
+
def send(message)
|
11
|
+
raise DataCollector::Error, "No client found" if @producer.nil? || stopped?
|
12
|
+
JSON.parse(@producer.publish(message, @bunny_queue))
|
13
|
+
end
|
14
|
+
|
15
|
+
def stop
|
16
|
+
if @producer && @running
|
17
|
+
@running = false
|
18
|
+
@producer.shutdown
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
private
|
23
|
+
def create_producer
|
24
|
+
@producer ||= BunnyBurrow::Client.new do |client|
|
25
|
+
parse_uri
|
26
|
+
client.rabbitmq_url = @bunny_uri.to_s
|
27
|
+
client.rabbitmq_exchange = @bunny_channel
|
28
|
+
client.logger = DataCollector::Core.logger
|
29
|
+
@running = true
|
30
|
+
end
|
31
|
+
end
|
32
|
+
def parse_uri
|
33
|
+
raise 'URI must be of format rpc+amqp://user:password@host/exchange/queue' unless @uri.path =~ /\// && @uri.path.split('/').length == 3
|
34
|
+
|
35
|
+
@bunny_channel = @uri.path.split('/')[1]
|
36
|
+
@bunny_queue = @uri.path.split('/')[2]
|
37
|
+
@bunny_uri = @uri.clone
|
38
|
+
@bunny_uri.path=''
|
39
|
+
@bunny_uri.scheme='amqp'
|
40
|
+
end
|
41
|
+
end
|
42
|
+
end
|
43
|
+
end
|
@@ -1,4 +1,4 @@
|
|
1
|
-
#encoding: UTF-8
|
1
|
+
# encoding: UTF-8
|
2
2
|
require 'nokogiri'
|
3
3
|
require 'erb'
|
4
4
|
require 'date'
|
@@ -7,48 +7,51 @@ require 'zlib'
|
|
7
7
|
require 'cgi'
|
8
8
|
require 'active_support/core_ext/hash'
|
9
9
|
require 'fileutils'
|
10
|
+
require_relative './output/rpc'
|
10
11
|
|
11
12
|
module DataCollector
|
12
13
|
class Output
|
13
14
|
include Enumerable
|
14
|
-
attr_reader :data
|
15
|
+
attr_reader :data
|
15
16
|
|
16
17
|
def initialize(data = {})
|
17
|
-
@data = data
|
18
|
+
@data = HashWithIndifferentAccess.new(data)
|
18
19
|
@logger = Logger.new(STDOUT)
|
19
20
|
end
|
20
21
|
|
21
|
-
def each
|
22
|
-
|
23
|
-
|
22
|
+
def each(&block)
|
23
|
+
if block_given?
|
24
|
+
@data.each(&block) if @data
|
25
|
+
else
|
26
|
+
to_enum(:each)
|
24
27
|
end
|
25
28
|
end
|
26
29
|
|
27
30
|
def [](k, v = nil)
|
28
|
-
data[k]
|
31
|
+
@data[k]
|
29
32
|
end
|
30
33
|
|
31
34
|
def []=(k, v = nil)
|
32
35
|
unless v.nil?
|
33
|
-
if data.has_key?(k)
|
34
|
-
if data[k].is_a?(Array) then
|
36
|
+
if @data.has_key?(k)
|
37
|
+
if @data[k].is_a?(Array) then
|
35
38
|
if v.is_a?(Array)
|
36
|
-
data[k] += v
|
39
|
+
@data[k] += v
|
37
40
|
else
|
38
|
-
data[k] << v
|
41
|
+
@data[k] << v
|
39
42
|
end
|
40
43
|
else
|
41
|
-
data[k] = v
|
44
|
+
@data[k] = v
|
42
45
|
# HELP: why am I creating an array here?
|
43
46
|
# t = data[k]
|
44
47
|
# data[k] = Array.new([t, v])
|
45
48
|
end
|
46
49
|
else
|
47
|
-
data[k] = v
|
50
|
+
@data[k] = v
|
48
51
|
end
|
49
52
|
end
|
50
53
|
|
51
|
-
data
|
54
|
+
@data
|
52
55
|
end
|
53
56
|
|
54
57
|
def <<(input_data)
|
@@ -57,7 +60,7 @@ module DataCollector
|
|
57
60
|
self[k] = input_data[k]
|
58
61
|
end
|
59
62
|
elsif input_data.is_a?(Array)
|
60
|
-
data["datap"] = [] unless @data.has_key?("datap")
|
63
|
+
@data["datap"] = [] unless @data.has_key?("datap")
|
61
64
|
d = @data["datap"].flatten.compact
|
62
65
|
d += input_data
|
63
66
|
@data["datap"] = d.compact.flatten
|
@@ -84,6 +87,14 @@ module DataCollector
|
|
84
87
|
@data
|
85
88
|
end
|
86
89
|
|
90
|
+
def flatten()
|
91
|
+
out = Hash.new
|
92
|
+
@data.each do |m|
|
93
|
+
out[m[0]] = m[1]
|
94
|
+
end
|
95
|
+
out
|
96
|
+
end
|
97
|
+
|
87
98
|
def crush
|
88
99
|
data = @data
|
89
100
|
@data = deep_compact(data)
|
@@ -91,11 +102,10 @@ module DataCollector
|
|
91
102
|
|
92
103
|
def clear
|
93
104
|
@data = {}
|
94
|
-
#GC.start(full_mark: true, immediate_sweep: true)
|
105
|
+
# GC.start(full_mark: true, immediate_sweep: true)
|
95
106
|
GC.start
|
96
107
|
end
|
97
108
|
|
98
|
-
|
99
109
|
def to_s(erb_file = nil)
|
100
110
|
data = @data
|
101
111
|
|
@@ -149,104 +159,174 @@ module DataCollector
|
|
149
159
|
|
150
160
|
data[:response_date] = DateTime.now.xmlschema
|
151
161
|
|
152
|
-
|
153
|
-
|
154
|
-
result
|
162
|
+
ERB.new(File.read(erb_file), 0, '>').result(binding)
|
155
163
|
rescue Exception => e
|
156
164
|
raise "unable to transform to text: #{e.message}"
|
157
165
|
end
|
158
166
|
|
159
167
|
def to_tmp_file(erb_file, records_dir)
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
end
|
165
|
-
|
166
|
-
unless File.directory?(records_dir)
|
167
|
-
FileUtils.mkdir_p(records_dir)
|
168
|
-
end
|
168
|
+
raise '[DEPRECATED] `to_tmp_file` deprecated. Please use `to_uri("file://abc.xml", {template: "template.erb", content_type: "application/xml"})` instead'
|
169
|
+
rescue Exception => e
|
170
|
+
raise "unable to save to file: #{e.message}"
|
171
|
+
end
|
169
172
|
|
170
|
-
|
173
|
+
def to_tar_file(erb_file, tar_file_name = nil)
|
174
|
+
raise '[DEPRECATED] `to_tar_file` deprecated. Please use `to_uri("file://abc.xml", {content_type: "application/xml", tar: true})` instead'
|
175
|
+
rescue Exception => e
|
176
|
+
raise "unable to save to file: #{e.message}"
|
177
|
+
end
|
171
178
|
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
179
|
+
def to_jsonfile (jsondata, jsonfile)
|
180
|
+
raise '[DEPRECATED] `to_jsonfile` deprecated. Please use `to_uri("file://abc.json", {template: "template.erb", content_type: "application/json"})` instead'
|
181
|
+
rescue Exception => e
|
182
|
+
raise "unable to save to jsonfile: #{e.message}"
|
176
183
|
end
|
177
184
|
|
178
|
-
def
|
179
|
-
|
180
|
-
|
185
|
+
def to_uri(destination, options = {})
|
186
|
+
destination = CGI.unescapeHTML(destination)
|
187
|
+
@logger.info("writing #{destination}")
|
188
|
+
uri = URI(destination)
|
189
|
+
begin
|
190
|
+
data = nil
|
191
|
+
case uri.scheme
|
192
|
+
when 'http'
|
193
|
+
data = to_http(uri, options)
|
194
|
+
when 'https'
|
195
|
+
data = to_https(uri, options)
|
196
|
+
when 'file'
|
197
|
+
data = to_file(uri, options)
|
198
|
+
when /amqp/
|
199
|
+
if uri.scheme =~ /^rpc/
|
200
|
+
data = to_rpc(uri, options)
|
201
|
+
else
|
202
|
+
data = to_queue(uri, options)
|
203
|
+
end
|
204
|
+
else
|
205
|
+
raise "Do not know how to process #{source}"
|
206
|
+
end
|
181
207
|
|
182
|
-
|
183
|
-
config.noblanks
|
184
|
-
end
|
208
|
+
data = data.nil? ? 'no data found' : data
|
185
209
|
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
210
|
+
if block_given?
|
211
|
+
yield data
|
212
|
+
else
|
213
|
+
data
|
190
214
|
end
|
215
|
+
rescue => e
|
216
|
+
@logger.info(e.message)
|
217
|
+
puts e.backtrace.join("\n")
|
218
|
+
nil
|
219
|
+
end
|
220
|
+
end
|
191
221
|
|
192
|
-
|
222
|
+
def to_xml(options = {})
|
223
|
+
if options.key?(:template)
|
224
|
+
result = to_s(options[:template])
|
225
|
+
xml_result = Nokogiri::XML(result, nil, 'UTF-8') do |config|
|
226
|
+
config.noblanks
|
227
|
+
end
|
193
228
|
else
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
f.tar.add_file_simple("#{id}_#{rand(1000)}.xml", data: xml_data, size: xml_data.size, mtime: Time.now.to_i)
|
229
|
+
xml_root = options[:root] || 'data'
|
230
|
+
xml_result = Nokogiri::XML(@data.to_xml(root: xml_root), nil, 'UTF-8') do |config|
|
231
|
+
config.noblanks
|
198
232
|
end
|
199
|
-
|
200
|
-
return tar_file_name
|
201
233
|
end
|
202
234
|
|
203
|
-
|
204
|
-
raise "unable to save to file: #{e.message}"
|
235
|
+
xml_result.to_s
|
205
236
|
end
|
206
237
|
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
238
|
+
def to_json(options = {})
|
239
|
+
if options.key?(:template)
|
240
|
+
result = to_s(options[:template])
|
241
|
+
else
|
242
|
+
result = @data
|
212
243
|
end
|
213
|
-
|
214
|
-
|
244
|
+
|
245
|
+
result.to_json
|
215
246
|
end
|
216
247
|
|
217
|
-
def
|
218
|
-
|
219
|
-
@data.each do |m|
|
220
|
-
out[m[0]] = m[1]
|
221
|
-
end
|
222
|
-
out
|
248
|
+
def to_rpc(uri, options = {})
|
249
|
+
DataCollector::Output::Rpc.new(uri, options)
|
223
250
|
end
|
224
251
|
|
252
|
+
def to_queueto_rpc(uri, options = {})
|
253
|
+
raise "to be implemented"
|
254
|
+
end
|
255
|
+
|
256
|
+
private
|
257
|
+
|
225
258
|
def deep_compact( data )
|
226
259
|
if data.is_a?(Hash)
|
227
|
-
#puts " - Hash - #{data}"
|
260
|
+
# puts " - Hash - #{data}"
|
228
261
|
data.compact!
|
229
262
|
data.each { |k, v| data[k] = deep_compact(v) }
|
230
263
|
data.compact!
|
231
|
-
data
|
232
264
|
elsif data.is_a?(Array)
|
233
|
-
#puts " - Array - #{data}"
|
234
|
-
data.
|
235
|
-
data.
|
265
|
+
# puts " - Array - #{data}"
|
266
|
+
data.map! { |v| deep_compact(v) }
|
267
|
+
data.compact!
|
236
268
|
#puts " - Array size- #{data.size}"
|
237
269
|
data.size == 1 ? data[0] : data
|
238
270
|
elsif data.is_a?(String)
|
239
|
-
#puts " - String - #{data}"
|
240
|
-
data.blank? ? nil : data
|
271
|
+
# puts " - String - #{data}"
|
272
|
+
data.strip.blank? ? nil : data
|
241
273
|
else
|
242
274
|
data
|
243
275
|
end
|
244
276
|
end
|
245
|
-
|
246
|
-
private
|
247
277
|
|
248
|
-
def
|
249
|
-
|
278
|
+
def to_http(uri, options)
|
279
|
+
to_https(uri, options)
|
250
280
|
end
|
281
|
+
|
282
|
+
def to_https(uri, options)
|
283
|
+
|
284
|
+
raise 'TODO'
|
285
|
+
end
|
286
|
+
|
287
|
+
def to_file(uri, options)
|
288
|
+
file_type = options[:content_type] || 'application/octet-stream'
|
289
|
+
file_name = options[:name] || "#{uri.host}#{uri.path}" || nil
|
290
|
+
tar_file_name = options[:tar_name] || "#{Time.now.to_i}_#{rand(1000)}.tar.gz"
|
291
|
+
tar = options[:tar] || options.key?(:tar_name) || false
|
292
|
+
result = ''
|
293
|
+
|
294
|
+
case file_type
|
295
|
+
when 'application/json'
|
296
|
+
result = to_json(options)
|
297
|
+
file_name = "#{Time.now.to_i}_#{rand(1000)}.json" if file_name.nil?
|
298
|
+
when 'application/xml'
|
299
|
+
result = to_xml(options)
|
300
|
+
file_name = "#{Time.now.to_i}_#{rand(1000)}.xml" if file_name.nil?
|
301
|
+
else
|
302
|
+
file_name = "#{Time.now.to_i}_#{rand(1000)}.txt" if file_name.nil?
|
303
|
+
result = @data.to_json
|
304
|
+
end
|
305
|
+
|
306
|
+
if tar
|
307
|
+
#tar_file = Zlib::GzipWriter.new(File.open("#{tar_file_name}", 'wb'))
|
308
|
+
tar_file = File.open("#{tar_file_name}", 'wb')
|
309
|
+
|
310
|
+
Minitar::Output.tar(tar_file) do |f|
|
311
|
+
f.add_file_simple("#{file_name}", {size: result.size, mtime: Time.now.to_i, data: result})
|
312
|
+
end
|
313
|
+
else
|
314
|
+
file_name_absolute_path = File.absolute_path(file_name)
|
315
|
+
file_directory = File.dirname(file_name_absolute_path)
|
316
|
+
|
317
|
+
unless File.directory?(file_directory)
|
318
|
+
FileUtils.mkdir_p(file_directory)
|
319
|
+
end
|
320
|
+
|
321
|
+
File.open(file_name_absolute_path, 'wb:UTF-8') do |f|
|
322
|
+
f.puts result
|
323
|
+
end
|
324
|
+
end
|
325
|
+
|
326
|
+
result
|
327
|
+
rescue StandardError => e
|
328
|
+
raise "Unable to save data: #{e.message}"
|
329
|
+
end
|
330
|
+
|
251
331
|
end
|
252
|
-
end
|
332
|
+
end
|
@@ -12,7 +12,7 @@ module DataCollector
|
|
12
12
|
@run_count = 0
|
13
13
|
|
14
14
|
@schedule = options[:schedule] || {}
|
15
|
-
@name = options[:name] || "
|
15
|
+
@name = options[:name] || "pipeline-#{Time.now.to_i}-#{rand(10000)}"
|
16
16
|
@options = options
|
17
17
|
@listeners = []
|
18
18
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: data_collector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.21.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Mehmet Celik
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-
|
11
|
+
date: 2023-07-17 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|
@@ -114,14 +114,14 @@ dependencies:
|
|
114
114
|
requirements:
|
115
115
|
- - "~>"
|
116
116
|
- !ruby/object:Gem::Version
|
117
|
-
version: '1.
|
117
|
+
version: '1.15'
|
118
118
|
type: :runtime
|
119
119
|
prerelease: false
|
120
120
|
version_requirements: !ruby/object:Gem::Requirement
|
121
121
|
requirements:
|
122
122
|
- - "~>"
|
123
123
|
- !ruby/object:Gem::Version
|
124
|
-
version: '1.
|
124
|
+
version: '1.15'
|
125
125
|
- !ruby/object:Gem::Dependency
|
126
126
|
name: nori
|
127
127
|
requirement: !ruby/object:Gem::Requirement
|
@@ -178,6 +178,34 @@ dependencies:
|
|
178
178
|
- - "~>"
|
179
179
|
- !ruby/object:Gem::Version
|
180
180
|
version: '2.20'
|
181
|
+
- !ruby/object:Gem::Dependency
|
182
|
+
name: bunny_burrow
|
183
|
+
requirement: !ruby/object:Gem::Requirement
|
184
|
+
requirements:
|
185
|
+
- - "~>"
|
186
|
+
- !ruby/object:Gem::Version
|
187
|
+
version: '1.5'
|
188
|
+
type: :runtime
|
189
|
+
prerelease: false
|
190
|
+
version_requirements: !ruby/object:Gem::Requirement
|
191
|
+
requirements:
|
192
|
+
- - "~>"
|
193
|
+
- !ruby/object:Gem::Version
|
194
|
+
version: '1.5'
|
195
|
+
- !ruby/object:Gem::Dependency
|
196
|
+
name: builder
|
197
|
+
requirement: !ruby/object:Gem::Requirement
|
198
|
+
requirements:
|
199
|
+
- - "~>"
|
200
|
+
- !ruby/object:Gem::Version
|
201
|
+
version: '3.2'
|
202
|
+
type: :runtime
|
203
|
+
prerelease: false
|
204
|
+
version_requirements: !ruby/object:Gem::Requirement
|
205
|
+
requirements:
|
206
|
+
- - "~>"
|
207
|
+
- !ruby/object:Gem::Version
|
208
|
+
version: '3.2'
|
181
209
|
- !ruby/object:Gem::Dependency
|
182
210
|
name: bundler
|
183
211
|
requirement: !ruby/object:Gem::Requirement
|
@@ -259,7 +287,10 @@ files:
|
|
259
287
|
- lib/data_collector/input/dir.rb
|
260
288
|
- lib/data_collector/input/generic.rb
|
261
289
|
- lib/data_collector/input/queue.rb
|
290
|
+
- lib/data_collector/input/rpc.rb
|
262
291
|
- lib/data_collector/output.rb
|
292
|
+
- lib/data_collector/output/generic.rb
|
293
|
+
- lib/data_collector/output/rpc.rb
|
263
294
|
- lib/data_collector/pipeline.rb
|
264
295
|
- lib/data_collector/rules.rb
|
265
296
|
- lib/data_collector/rules.rb.depricated
|