data_collector 0.24.0 → 0.26.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +18 -0
- data/data_collector.gemspec +1 -0
- data/lib/data_collector/input/rpc.rb +1 -1
- data/lib/data_collector/input.rb +16 -5
- data/lib/data_collector/pipeline.rb +16 -0
- data/lib/data_collector/version.rb +1 -1
- metadata +17 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 057dac92f5a83458b244a962ef0518fa093365a38c2388582de0556ac7b884ca
|
4
|
+
data.tar.gz: 9aebb908d52dda01dc3efce7dcda3e40c8b4f45e718cef94052c9d37fca604eb
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: aa91e78ec3aef0010553886539284af4aafb3ed7014ef35e6f7094377bf9ad2efb7f6025bc7f8715d4558d8a5a1242325761a12a6e696590828fa1752c579198
|
7
|
+
data.tar.gz: 7309cc34f3d022d212ab9870558909ab84be3c6550160ce822808f98edb259c96669899247a7faefec6365068ac5aa68a6d5b43f279591156f47c9f02d0e182c
|
data/README.md
CHANGED
@@ -15,6 +15,11 @@ You can set a schedule for pipelines that are triggered by new data, specifying
|
|
15
15
|
executed in the [ISO8601 duration format](https://www.digi.com/resources/documentation/digidocs//90001488-13/reference/r_iso_8601_duration_format.htm). The processing logic is then executed.
|
16
16
|
###### methods:
|
17
17
|
- .new(options): options can be schedule in [ISO8601 duration format](https://www.digi.com/resources/documentation/digidocs//90001488-13/reference/r_iso_8601_duration_format.htm) and name
|
18
|
+
- options:
|
19
|
+
- name: pipeline name
|
20
|
+
- schedule: [ISO8601 duration format](https://www.digi.com/resources/documentation/digidocs//90001488-13/reference/r_iso_8601_duration_format.htm)
|
21
|
+
- cron: a schedule in cron format, e.g. '1 12 * * *'. Intervals are not supported.
|
22
|
+
- uri: a directory/file to watch
|
18
23
|
- .run: start the pipeline. blocking if a schedule is supplied
|
19
24
|
- .stop: stop the pipeline
|
20
25
|
- .pause: pause the pipeline. Restart using .run
|
@@ -37,6 +42,19 @@ end
|
|
37
42
|
pipeline.run
|
38
43
|
```
|
39
44
|
|
45
|
+
```ruby
|
46
|
+
#create a pipeline scheduled to run every morning at 06:00 am
|
47
|
+
pipeline = Pipeline.new(cron: '0 6 * * *')
|
48
|
+
|
49
|
+
pipeline.on_message do |input, output|
|
50
|
+
data = input.from_uri("https://dummyjson.com/comments?limit=10")
|
51
|
+
# process data
|
52
|
+
end
|
53
|
+
|
54
|
+
pipeline.run
|
55
|
+
```
|
56
|
+
|
57
|
+
|
40
58
|
```ruby
|
41
59
|
#create a pipeline to listen and process files in a directory
|
42
60
|
extract = DataCollector::Pipeline.new(name: 'extract', uri: 'file://./data/in')
|
data/data_collector.gemspec
CHANGED
@@ -50,6 +50,7 @@ Gem::Specification.new do |spec|
|
|
50
50
|
spec.add_runtime_dependency 'bunny', '~> 2.20'
|
51
51
|
spec.add_runtime_dependency 'bunny_burrow', '~> 1.5'
|
52
52
|
spec.add_runtime_dependency 'builder', '~> 3.2'
|
53
|
+
spec.add_runtime_dependency 'parse-cron', '~> 0.1'
|
53
54
|
|
54
55
|
spec.add_development_dependency 'bundler', '~> 2.3'
|
55
56
|
spec.add_development_dependency 'minitest', '~> 5.18'
|
data/lib/data_collector/input.rb
CHANGED
@@ -31,7 +31,7 @@ module DataCollector
|
|
31
31
|
raise DataCollector::Error 'from_uri expects a scheme like file:// of https://' unless source =~ /:\/\//
|
32
32
|
|
33
33
|
scheme, path = source.split('://')
|
34
|
-
source="#{scheme}://#{URI.
|
34
|
+
source="#{scheme}://#{URI.encode_www_form_component(path)}"
|
35
35
|
uri = URI(source)
|
36
36
|
begin
|
37
37
|
data = nil
|
@@ -41,7 +41,7 @@ module DataCollector
|
|
41
41
|
when 'https'
|
42
42
|
data = from_https(uri, options)
|
43
43
|
when 'file'
|
44
|
-
absolute_path = File.absolute_path("#{URI.
|
44
|
+
absolute_path = File.absolute_path("#{URI.decode_www_form_component("#{uri.host}#{uri.path}")}")
|
45
45
|
if File.directory?(absolute_path)
|
46
46
|
#raise DataCollector::Error, "#{uri.host}/#{uri.path} not found" unless File.exist?("#{uri.host}/#{uri.path}")
|
47
47
|
return from_dir(uri, options)
|
@@ -80,7 +80,7 @@ module DataCollector
|
|
80
80
|
end
|
81
81
|
|
82
82
|
def from_https(uri, options = {})
|
83
|
-
uri = URI.
|
83
|
+
uri = URI.decode_www_form_component("#{uri.to_s}")
|
84
84
|
data = nil
|
85
85
|
if options.with_indifferent_access.include?(:logging) && options.with_indifferent_access[:logging]
|
86
86
|
HTTP.default_options = HTTP::Options.new(features: { logging: { logger: @logger } })
|
@@ -102,6 +102,16 @@ module DataCollector
|
|
102
102
|
http = HTTP.auth(bearer)
|
103
103
|
end
|
104
104
|
|
105
|
+
if options.key?(:cookies)
|
106
|
+
@logger.debug "Set cookies"
|
107
|
+
http = http.cookies( options[:cookies] )
|
108
|
+
end
|
109
|
+
|
110
|
+
if options.key?(:headers)
|
111
|
+
@logger.debug "Set http headers"
|
112
|
+
http = http.headers( options[:headers] )
|
113
|
+
end
|
114
|
+
|
105
115
|
if options.key?(:verify_ssl) && uri.scheme.eql?('https')
|
106
116
|
@logger.warn "Disabling SSL verification. "
|
107
117
|
#shouldn't use this but we all do ...
|
@@ -109,10 +119,11 @@ module DataCollector
|
|
109
119
|
ctx.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
110
120
|
|
111
121
|
http_response = http.follow.get(escape_uri(uri), ssl_context: ctx)
|
122
|
+
|
112
123
|
else
|
113
124
|
http_response = http.follow.get(escape_uri(uri))
|
114
125
|
end
|
115
|
-
|
126
|
+
|
116
127
|
case http_response.code
|
117
128
|
when 200..299
|
118
129
|
@raw = data = http_response.body.to_s
|
@@ -235,7 +246,7 @@ module DataCollector
|
|
235
246
|
end
|
236
247
|
|
237
248
|
def normalize_uri(uri)
|
238
|
-
"#{URI.
|
249
|
+
"#{URI.decode_www_form_component(uri.host)}#{URI.decode_www_form_component(uri.path)}"
|
239
250
|
end
|
240
251
|
end
|
241
252
|
end
|
@@ -1,4 +1,5 @@
|
|
1
1
|
require 'iso8601'
|
2
|
+
require 'parse-cron'
|
2
3
|
|
3
4
|
module DataCollector
|
4
5
|
class Pipeline
|
@@ -12,6 +13,7 @@ module DataCollector
|
|
12
13
|
@run_count = 0
|
13
14
|
|
14
15
|
@schedule = options[:schedule] || {}
|
16
|
+
@cron = options[:cron || '']
|
15
17
|
@name = options[:name] || "pipeline-#{Time.now.to_i}-#{rand(10000)}"
|
16
18
|
@options = options
|
17
19
|
@listeners = []
|
@@ -43,6 +45,20 @@ module DataCollector
|
|
43
45
|
|
44
46
|
DataCollector::Core.log("PIPELINE running in #{interval.size} seconds")
|
45
47
|
sleep interval.size
|
48
|
+
handle_on_message(@input, @output) unless paused?
|
49
|
+
end
|
50
|
+
elsif @cron && !@cron.empty?
|
51
|
+
cron_parser = CronParser.new(@cron)
|
52
|
+
while running?
|
53
|
+
@run_count += 1
|
54
|
+
start_time = ISO8601::DateTime.new(Time.now.to_datetime.to_s)
|
55
|
+
next_run = cron_parser.next(start_time.to_time)
|
56
|
+
|
57
|
+
interval = ISO8601::TimeInterval.from_datetimes(start_time, ISO8601::DateTime.new(next_run.to_datetime.to_s))
|
58
|
+
|
59
|
+
DataCollector::Core.log("PIPELINE running at #{next_run.to_datetime.strftime('%Y-%m-%dT%H:%M:%S')} or in #{interval.size} seconds")
|
60
|
+
sleep interval.size
|
61
|
+
|
46
62
|
handle_on_message(@input, @output) unless paused?
|
47
63
|
end
|
48
64
|
else # run once
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: data_collector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.26.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Mehmet Celik
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-
|
11
|
+
date: 2023-08-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|
@@ -206,6 +206,20 @@ dependencies:
|
|
206
206
|
- - "~>"
|
207
207
|
- !ruby/object:Gem::Version
|
208
208
|
version: '3.2'
|
209
|
+
- !ruby/object:Gem::Dependency
|
210
|
+
name: parse-cron
|
211
|
+
requirement: !ruby/object:Gem::Requirement
|
212
|
+
requirements:
|
213
|
+
- - "~>"
|
214
|
+
- !ruby/object:Gem::Version
|
215
|
+
version: '0.1'
|
216
|
+
type: :runtime
|
217
|
+
prerelease: false
|
218
|
+
version_requirements: !ruby/object:Gem::Requirement
|
219
|
+
requirements:
|
220
|
+
- - "~>"
|
221
|
+
- !ruby/object:Gem::Version
|
222
|
+
version: '0.1'
|
209
223
|
- !ruby/object:Gem::Dependency
|
210
224
|
name: bundler
|
211
225
|
requirement: !ruby/object:Gem::Requirement
|
@@ -319,7 +333,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
319
333
|
- !ruby/object:Gem::Version
|
320
334
|
version: '0'
|
321
335
|
requirements: []
|
322
|
-
rubygems_version: 3.4.
|
336
|
+
rubygems_version: 3.4.13
|
323
337
|
signing_key:
|
324
338
|
specification_version: 4
|
325
339
|
summary: ETL helper library
|