data_collector 0.24.0 → 0.26.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7745e79eb3836ab3c469cc5da39f395d42c144940b98115c96058fb01f8a629c
4
- data.tar.gz: 77aeb246a6a23477d07195c020091999825af21c9639c0a9679017daefeea9e9
3
+ metadata.gz: 057dac92f5a83458b244a962ef0518fa093365a38c2388582de0556ac7b884ca
4
+ data.tar.gz: 9aebb908d52dda01dc3efce7dcda3e40c8b4f45e718cef94052c9d37fca604eb
5
5
  SHA512:
6
- metadata.gz: e48596ac6e5fc14be89c2aadc50416558fa4a08594d28cccee630c1157dc365a556999c82d303e4669c94e1db88d6b0cf3f044730d0057586220a6c3172b72a6
7
- data.tar.gz: 2112dfa9191e8aa948317a16d581a9ea487327030cb013b43937757c144c33ca4fc975faee91fd6731d8f043ed0cefd5ecc28a562351c5de9b96196167871e36
6
+ metadata.gz: aa91e78ec3aef0010553886539284af4aafb3ed7014ef35e6f7094377bf9ad2efb7f6025bc7f8715d4558d8a5a1242325761a12a6e696590828fa1752c579198
7
+ data.tar.gz: 7309cc34f3d022d212ab9870558909ab84be3c6550160ce822808f98edb259c96669899247a7faefec6365068ac5aa68a6d5b43f279591156f47c9f02d0e182c
data/README.md CHANGED
@@ -15,6 +15,11 @@ You can set a schedule for pipelines that are triggered by new data, specifying
15
15
  executed in the [ISO8601 duration format](https://www.digi.com/resources/documentation/digidocs//90001488-13/reference/r_iso_8601_duration_format.htm). The processing logic is then executed.
16
16
  ###### methods:
17
17
  - .new(options): options can be a schedule in [ISO8601 duration format](https://www.digi.com/resources/documentation/digidocs//90001488-13/reference/r_iso_8601_duration_format.htm) and a name
18
+ - options:
19
+ - name: pipeline name
20
+ - schedule: [ISO8601 duration format](https://www.digi.com/resources/documentation/digidocs//90001488-13/reference/r_iso_8601_duration_format.htm)
21
+ - cron: a schedule in cron format, e.g. '1 12 * * *'; intervals are not supported
22
+ - uri: a directory/file to watch
18
23
  - .run: start the pipeline. blocking if a schedule is supplied
19
24
  - .stop: stop the pipeline
20
25
  - .pause: pause the pipeline. Restart using .run
@@ -37,6 +42,19 @@ end
37
42
  pipeline.run
38
43
  ```
39
44
 
45
+ ```ruby
46
+ #create a pipeline scheduled to run every morning at 06:00 am
47
+ pipeline = Pipeline.new(cron: '0 6 * * *')
48
+
49
+ pipeline.on_message do |input, output|
50
+ data = input.from_uri("https://dummyjson.com/comments?limit=10")
51
+ # process data
52
+ end
53
+
54
+ pipeline.run
55
+ ```
56
+
57
+
40
58
  ```ruby
41
59
  #create a pipeline to listen and process files in a directory
42
60
  extract = DataCollector::Pipeline.new(name: 'extract', uri: 'file://./data/in')
@@ -50,6 +50,7 @@ Gem::Specification.new do |spec|
50
50
  spec.add_runtime_dependency 'bunny', '~> 2.20'
51
51
  spec.add_runtime_dependency 'bunny_burrow', '~> 1.5'
52
52
  spec.add_runtime_dependency 'builder', '~> 3.2'
53
+ spec.add_runtime_dependency 'parse-cron', '~> 0.1'
53
54
 
54
55
  spec.add_development_dependency 'bundler', '~> 2.3'
55
56
  spec.add_development_dependency 'minitest', '~> 5.18'
@@ -55,7 +55,7 @@ module DataCollector
55
55
  parse_uri
56
56
  server.rabbitmq_url = @bunny_uri.to_s
57
57
  server.rabbitmq_exchange = @bunny_channel
58
- server.logger = DataCollector::Core.logger
58
+ #server.logger = DataCollector::Core.logger
59
59
  end
60
60
  end
61
61
 
@@ -31,7 +31,7 @@ module DataCollector
31
31
  raise DataCollector::Error 'from_uri expects a scheme like file:// of https://' unless source =~ /:\/\//
32
32
 
33
33
  scheme, path = source.split('://')
34
- source="#{scheme}://#{URI.encode_uri_component(path)}"
34
+ source="#{scheme}://#{URI.encode_www_form_component(path)}"
35
35
  uri = URI(source)
36
36
  begin
37
37
  data = nil
@@ -41,7 +41,7 @@ module DataCollector
41
41
  when 'https'
42
42
  data = from_https(uri, options)
43
43
  when 'file'
44
- absolute_path = File.absolute_path("#{URI.decode_uri_component("#{uri.host}#{uri.path}")}")
44
+ absolute_path = File.absolute_path("#{URI.decode_www_form_component("#{uri.host}#{uri.path}")}")
45
45
  if File.directory?(absolute_path)
46
46
  #raise DataCollector::Error, "#{uri.host}/#{uri.path} not found" unless File.exist?("#{uri.host}/#{uri.path}")
47
47
  return from_dir(uri, options)
@@ -80,7 +80,7 @@ module DataCollector
80
80
  end
81
81
 
82
82
  def from_https(uri, options = {})
83
- uri = URI.decode_uri_component("#{uri.to_s}")
83
+ uri = URI.decode_www_form_component("#{uri.to_s}")
84
84
  data = nil
85
85
  if options.with_indifferent_access.include?(:logging) && options.with_indifferent_access[:logging]
86
86
  HTTP.default_options = HTTP::Options.new(features: { logging: { logger: @logger } })
@@ -102,6 +102,16 @@ module DataCollector
102
102
  http = HTTP.auth(bearer)
103
103
  end
104
104
 
105
+ if options.key?(:cookies)
106
+ @logger.debug "Set cookies"
107
+ http = http.cookies( options[:cookies] )
108
+ end
109
+
110
+ if options.key?(:headers)
111
+ @logger.debug "Set http headers"
112
+ http = http.headers( options[:headers] )
113
+ end
114
+
105
115
  if options.key?(:verify_ssl) && uri.scheme.eql?('https')
106
116
  @logger.warn "Disabling SSL verification. "
107
117
  #shouldn't use this but we all do ...
@@ -109,10 +119,11 @@ module DataCollector
109
119
  ctx.verify_mode = OpenSSL::SSL::VERIFY_NONE
110
120
 
111
121
  http_response = http.follow.get(escape_uri(uri), ssl_context: ctx)
122
+
112
123
  else
113
124
  http_response = http.follow.get(escape_uri(uri))
114
125
  end
115
-
126
+
116
127
  case http_response.code
117
128
  when 200..299
118
129
  @raw = data = http_response.body.to_s
@@ -235,7 +246,7 @@ module DataCollector
235
246
  end
236
247
 
237
248
  def normalize_uri(uri)
238
- "#{URI.decode_uri_component(uri.host)}#{URI.decode_uri_component(uri.path)}"
249
+ "#{URI.decode_www_form_component(uri.host)}#{URI.decode_www_form_component(uri.path)}"
239
250
  end
240
251
  end
241
252
  end
@@ -1,4 +1,5 @@
1
1
  require 'iso8601'
2
+ require 'parse-cron'
2
3
 
3
4
  module DataCollector
4
5
  class Pipeline
@@ -12,6 +13,7 @@ module DataCollector
12
13
  @run_count = 0
13
14
 
14
15
  @schedule = options[:schedule] || {}
16
+ @cron = options[:cron || '']
15
17
  @name = options[:name] || "pipeline-#{Time.now.to_i}-#{rand(10000)}"
16
18
  @options = options
17
19
  @listeners = []
@@ -43,6 +45,20 @@ module DataCollector
43
45
 
44
46
  DataCollector::Core.log("PIPELINE running in #{interval.size} seconds")
45
47
  sleep interval.size
48
+ handle_on_message(@input, @output) unless paused?
49
+ end
50
+ elsif @cron && !@cron.empty?
51
+ cron_parser = CronParser.new(@cron)
52
+ while running?
53
+ @run_count += 1
54
+ start_time = ISO8601::DateTime.new(Time.now.to_datetime.to_s)
55
+ next_run = cron_parser.next(start_time.to_time)
56
+
57
+ interval = ISO8601::TimeInterval.from_datetimes(start_time, ISO8601::DateTime.new(next_run.to_datetime.to_s))
58
+
59
+ DataCollector::Core.log("PIPELINE running at #{next_run.to_datetime.strftime('%Y-%m-%dT%H:%M:%S')} or in #{interval.size} seconds")
60
+ sleep interval.size
61
+
46
62
  handle_on_message(@input, @output) unless paused?
47
63
  end
48
64
  else # run once
@@ -1,4 +1,4 @@
1
1
  # encoding: utf-8
2
2
  module DataCollector
3
- VERSION = "0.24.0"
3
+ VERSION = "0.26.0"
4
4
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: data_collector
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.24.0
4
+ version: 0.26.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Mehmet Celik
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-07-24 00:00:00.000000000 Z
11
+ date: 2023-08-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activesupport
@@ -206,6 +206,20 @@ dependencies:
206
206
  - - "~>"
207
207
  - !ruby/object:Gem::Version
208
208
  version: '3.2'
209
+ - !ruby/object:Gem::Dependency
210
+ name: parse-cron
211
+ requirement: !ruby/object:Gem::Requirement
212
+ requirements:
213
+ - - "~>"
214
+ - !ruby/object:Gem::Version
215
+ version: '0.1'
216
+ type: :runtime
217
+ prerelease: false
218
+ version_requirements: !ruby/object:Gem::Requirement
219
+ requirements:
220
+ - - "~>"
221
+ - !ruby/object:Gem::Version
222
+ version: '0.1'
209
223
  - !ruby/object:Gem::Dependency
210
224
  name: bundler
211
225
  requirement: !ruby/object:Gem::Requirement
@@ -319,7 +333,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
319
333
  - !ruby/object:Gem::Version
320
334
  version: '0'
321
335
  requirements: []
322
- rubygems_version: 3.4.10
336
+ rubygems_version: 3.4.13
323
337
  signing_key:
324
338
  specification_version: 4
325
339
  summary: ETL helper library