data_collector 0.24.0 → 0.26.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7745e79eb3836ab3c469cc5da39f395d42c144940b98115c96058fb01f8a629c
4
- data.tar.gz: 77aeb246a6a23477d07195c020091999825af21c9639c0a9679017daefeea9e9
3
+ metadata.gz: 057dac92f5a83458b244a962ef0518fa093365a38c2388582de0556ac7b884ca
4
+ data.tar.gz: 9aebb908d52dda01dc3efce7dcda3e40c8b4f45e718cef94052c9d37fca604eb
5
5
  SHA512:
6
- metadata.gz: e48596ac6e5fc14be89c2aadc50416558fa4a08594d28cccee630c1157dc365a556999c82d303e4669c94e1db88d6b0cf3f044730d0057586220a6c3172b72a6
7
- data.tar.gz: 2112dfa9191e8aa948317a16d581a9ea487327030cb013b43937757c144c33ca4fc975faee91fd6731d8f043ed0cefd5ecc28a562351c5de9b96196167871e36
6
+ metadata.gz: aa91e78ec3aef0010553886539284af4aafb3ed7014ef35e6f7094377bf9ad2efb7f6025bc7f8715d4558d8a5a1242325761a12a6e696590828fa1752c579198
7
+ data.tar.gz: 7309cc34f3d022d212ab9870558909ab84be3c6550160ce822808f98edb259c96669899247a7faefec6365068ac5aa68a6d5b43f279591156f47c9f02d0e182c
data/README.md CHANGED
@@ -15,6 +15,11 @@ You can set a schedule for pipelines that are triggered by new data, specifying
15
15
  executed in the [ISO8601 duration format](https://www.digi.com/resources/documentation/digidocs//90001488-13/reference/r_iso_8601_duration_format.htm). The processing logic is then executed.
16
16
  ###### methods:
17
17
  - .new(options): options can be a schedule in [ISO8601 duration format](https://www.digi.com/resources/documentation/digidocs//90001488-13/reference/r_iso_8601_duration_format.htm) and a name
18
+ - options:
19
+ - name: pipeline name
20
+ - schedule: [ISO8601 duration format](https://www.digi.com/resources/documentation/digidocs//90001488-13/reference/r_iso_8601_duration_format.htm)
21
+ - cron: schedule in cron format, e.g. '1 12 * * *'; intervals are not supported
22
+ - uri: a directory/file to watch
18
23
  - .run: start the pipeline. blocking if a schedule is supplied
19
24
  - .stop: stop the pipeline
20
25
  - .pause: pause the pipeline. Restart using .run
@@ -37,6 +42,19 @@ end
37
42
  pipeline.run
38
43
  ```
39
44
 
45
+ ```ruby
46
+ #create a pipeline scheduled to run every morning at 06:00 am
47
+ pipeline = Pipeline.new(cron: '0 6 * * *')
48
+
49
+ pipeline.on_message do |input, output|
50
+ data = input.from_uri("https://dummyjson.com/comments?limit=10")
51
+ # process data
52
+ end
53
+
54
+ pipeline.run
55
+ ```
56
+
57
+
40
58
  ```ruby
41
59
  #create a pipeline to listen and process files in a directory
42
60
  extract = DataCollector::Pipeline.new(name: 'extract', uri: 'file://./data/in')
@@ -50,6 +50,7 @@ Gem::Specification.new do |spec|
50
50
  spec.add_runtime_dependency 'bunny', '~> 2.20'
51
51
  spec.add_runtime_dependency 'bunny_burrow', '~> 1.5'
52
52
  spec.add_runtime_dependency 'builder', '~> 3.2'
53
+ spec.add_runtime_dependency 'parse-cron', '~> 0.1'
53
54
 
54
55
  spec.add_development_dependency 'bundler', '~> 2.3'
55
56
  spec.add_development_dependency 'minitest', '~> 5.18'
@@ -55,7 +55,7 @@ module DataCollector
55
55
  parse_uri
56
56
  server.rabbitmq_url = @bunny_uri.to_s
57
57
  server.rabbitmq_exchange = @bunny_channel
58
- server.logger = DataCollector::Core.logger
58
+ #server.logger = DataCollector::Core.logger
59
59
  end
60
60
  end
61
61
 
@@ -31,7 +31,7 @@ module DataCollector
31
31
  raise DataCollector::Error 'from_uri expects a scheme like file:// of https://' unless source =~ /:\/\//
32
32
 
33
33
  scheme, path = source.split('://')
34
- source="#{scheme}://#{URI.encode_uri_component(path)}"
34
+ source="#{scheme}://#{URI.encode_www_form_component(path)}"
35
35
  uri = URI(source)
36
36
  begin
37
37
  data = nil
@@ -41,7 +41,7 @@ module DataCollector
41
41
  when 'https'
42
42
  data = from_https(uri, options)
43
43
  when 'file'
44
- absolute_path = File.absolute_path("#{URI.decode_uri_component("#{uri.host}#{uri.path}")}")
44
+ absolute_path = File.absolute_path("#{URI.decode_www_form_component("#{uri.host}#{uri.path}")}")
45
45
  if File.directory?(absolute_path)
46
46
  #raise DataCollector::Error, "#{uri.host}/#{uri.path} not found" unless File.exist?("#{uri.host}/#{uri.path}")
47
47
  return from_dir(uri, options)
@@ -80,7 +80,7 @@ module DataCollector
80
80
  end
81
81
 
82
82
  def from_https(uri, options = {})
83
- uri = URI.decode_uri_component("#{uri.to_s}")
83
+ uri = URI.decode_www_form_component("#{uri.to_s}")
84
84
  data = nil
85
85
  if options.with_indifferent_access.include?(:logging) && options.with_indifferent_access[:logging]
86
86
  HTTP.default_options = HTTP::Options.new(features: { logging: { logger: @logger } })
@@ -102,6 +102,16 @@ module DataCollector
102
102
  http = HTTP.auth(bearer)
103
103
  end
104
104
 
105
+ if options.key?(:cookies)
106
+ @logger.debug "Set cookies"
107
+ http = http.cookies( options[:cookies] )
108
+ end
109
+
110
+ if options.key?(:headers)
111
+ @logger.debug "Set http headers"
112
+ http = http.headers( options[:headers] )
113
+ end
114
+
105
115
  if options.key?(:verify_ssl) && uri.scheme.eql?('https')
106
116
  @logger.warn "Disabling SSL verification. "
107
117
  #shouldn't use this but we all do ...
@@ -109,10 +119,11 @@ module DataCollector
109
119
  ctx.verify_mode = OpenSSL::SSL::VERIFY_NONE
110
120
 
111
121
  http_response = http.follow.get(escape_uri(uri), ssl_context: ctx)
122
+
112
123
  else
113
124
  http_response = http.follow.get(escape_uri(uri))
114
125
  end
115
-
126
+
116
127
  case http_response.code
117
128
  when 200..299
118
129
  @raw = data = http_response.body.to_s
@@ -235,7 +246,7 @@ module DataCollector
235
246
  end
236
247
 
237
248
  def normalize_uri(uri)
238
- "#{URI.decode_uri_component(uri.host)}#{URI.decode_uri_component(uri.path)}"
249
+ "#{URI.decode_www_form_component(uri.host)}#{URI.decode_www_form_component(uri.path)}"
239
250
  end
240
251
  end
241
252
  end
@@ -1,4 +1,5 @@
1
1
  require 'iso8601'
2
+ require 'parse-cron'
2
3
 
3
4
  module DataCollector
4
5
  class Pipeline
@@ -12,6 +13,7 @@ module DataCollector
12
13
  @run_count = 0
13
14
 
14
15
  @schedule = options[:schedule] || {}
16
+ @cron = options[:cron || '']
15
17
  @name = options[:name] || "pipeline-#{Time.now.to_i}-#{rand(10000)}"
16
18
  @options = options
17
19
  @listeners = []
@@ -43,6 +45,20 @@ module DataCollector
43
45
 
44
46
  DataCollector::Core.log("PIPELINE running in #{interval.size} seconds")
45
47
  sleep interval.size
48
+ handle_on_message(@input, @output) unless paused?
49
+ end
50
+ elsif @cron && !@cron.empty?
51
+ cron_parser = CronParser.new(@cron)
52
+ while running?
53
+ @run_count += 1
54
+ start_time = ISO8601::DateTime.new(Time.now.to_datetime.to_s)
55
+ next_run = cron_parser.next(start_time.to_time)
56
+
57
+ interval = ISO8601::TimeInterval.from_datetimes(start_time, ISO8601::DateTime.new(next_run.to_datetime.to_s))
58
+
59
+ DataCollector::Core.log("PIPELINE running at #{next_run.to_datetime.strftime('%Y-%m-%dT%H:%M:%S')} or in #{interval.size} seconds")
60
+ sleep interval.size
61
+
46
62
  handle_on_message(@input, @output) unless paused?
47
63
  end
48
64
  else # run once
@@ -1,4 +1,4 @@
1
1
  # encoding: utf-8
2
2
  module DataCollector
3
- VERSION = "0.24.0"
3
+ VERSION = "0.26.0"
4
4
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: data_collector
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.24.0
4
+ version: 0.26.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Mehmet Celik
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-07-24 00:00:00.000000000 Z
11
+ date: 2023-08-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activesupport
@@ -206,6 +206,20 @@ dependencies:
206
206
  - - "~>"
207
207
  - !ruby/object:Gem::Version
208
208
  version: '3.2'
209
+ - !ruby/object:Gem::Dependency
210
+ name: parse-cron
211
+ requirement: !ruby/object:Gem::Requirement
212
+ requirements:
213
+ - - "~>"
214
+ - !ruby/object:Gem::Version
215
+ version: '0.1'
216
+ type: :runtime
217
+ prerelease: false
218
+ version_requirements: !ruby/object:Gem::Requirement
219
+ requirements:
220
+ - - "~>"
221
+ - !ruby/object:Gem::Version
222
+ version: '0.1'
209
223
  - !ruby/object:Gem::Dependency
210
224
  name: bundler
211
225
  requirement: !ruby/object:Gem::Requirement
@@ -319,7 +333,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
319
333
  - !ruby/object:Gem::Version
320
334
  version: '0'
321
335
  requirements: []
322
- rubygems_version: 3.4.10
336
+ rubygems_version: 3.4.13
323
337
  signing_key:
324
338
  specification_version: 4
325
339
  summary: ETL helper library