data_collector 0.34.0 → 0.36.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2dfc0690cc1200254a8d5af979048c4b1049d481a7ae31dd8984634a820e1db3
4
- data.tar.gz: c9cfdea232374ccb3490b8a02e60d3ab8e418ae5300cf367956482d5888f775c
3
+ metadata.gz: 649b618a7676b2153451b2bc3053726dd332ccc95a58ce0fe7115f4c9cc5a7eb
4
+ data.tar.gz: 625ca0e9da4e6b986d7ead973c80fc04a87a5539f0f7263efc672ee0f3ef7eb1
5
5
  SHA512:
6
- metadata.gz: 649feb684c3449321da8611f525f09319ab592a5f810dfb7da2258d120519a32c4ac80072669d525af3ff935373842d9db03dedb27e04c16dcfd25a7dfc4435d
7
- data.tar.gz: 292f698256ed9e8ab5e1ddc2903edcb0ab17a7471de5d2537fa8f099ed9259030afb576040485e0f5f899c456e0360dff69b7bae7f0907f1be079661cdb7a60d
6
+ metadata.gz: 2372b50b9fc9e6f5a17575c1755e169f0ddbaef44ed96847d8f5c87b28a3e30e46276cf47f6e506fe843a2293190b54fd556978ea175f937693abd4f83a637dd
7
+ data.tar.gz: 7a417a9ee3a70140a3601d20613159222f6c2d6d0d7feaa99c447c7e31306f14c6f08c50543cb8fda85315b5fe62abde202e2b3bda51ac421f6a517f7b6be740
data/README.md CHANGED
@@ -84,6 +84,8 @@ A push happens when new data is created in a directory, message queue, ...
84
84
  - content_type: _string_ force a content_type if the 'Content-Type' returned by the http server is incorrect
85
85
  - headers: request headers
86
86
  - cookies: session cookies etc.
87
+ - method: http verb, one of [GET, POST] (default: 'GET')
88
+ - body: http post body
87
89
 
88
90
  ###### example:
89
91
  ```ruby
@@ -91,6 +93,8 @@ A push happens when new data is created in a directory, message queue, ...
91
93
  input.from_uri("http://www.libis.be")
92
94
  input.from_uri("file://hello.txt")
93
95
  input.from_uri("http://www.libis.be/record.jsonld", content_type: 'application/ld+json')
96
+ input.from_uri("https://www.w3.org/TR/rdf12-turtle/examples/example1.ttl")
97
+ input.from_uri("https://dbpedia.org/sparql", body: "query=SELECT * WHERE {?sub ?pred ?obj} LIMIT 10", method:"POST", headers: {accept: "text/turtle"})
94
98
 
95
99
  # read data from a RabbitMQ queue
96
100
  listener = input.from_uri('amqp://user:password@localhost?channel=hello&queue=world')
@@ -399,6 +403,13 @@ Should give as output
399
403
  </data>
400
404
  ```
401
405
 
406
+ You can pass options to input.from_uri to control how CSV data is parsed. These
407
+ are the same options accepted by the Ruby [CSV](https://docs.ruby-lang.org/en/master/CSV.html#class-CSV-label-Options) class.
408
+
409
+ Loading a CSV file with **;** as the column separator
410
+ ```ruby
411
+ i = input.from_uri('https://support.staffbase.com/hc/en-us/article_attachments/360009197031/username.csv', col_sep: ';')
412
+ ```
402
413
 
403
414
  ## Installation
404
415
 
@@ -39,7 +39,6 @@ Gem::Specification.new do |spec|
39
39
  spec.add_runtime_dependency 'activesupport', '~> 7.0'
40
40
  spec.add_runtime_dependency 'http', '~> 5.1'
41
41
  spec.add_runtime_dependency 'json', '~> 2.6'
42
- spec.add_runtime_dependency 'json-ld', '~> 3.3'
43
42
  spec.add_runtime_dependency 'jsonpath', '~> 1.1'
44
43
  spec.add_runtime_dependency 'mime-types', '~> 3.5'
45
44
  spec.add_runtime_dependency 'minitar', '= 0.9'
@@ -51,6 +50,7 @@ Gem::Specification.new do |spec|
51
50
  spec.add_runtime_dependency 'bunny_burrow', '~> 1.5'
52
51
  spec.add_runtime_dependency 'builder', '~> 3.2'
53
52
  spec.add_runtime_dependency 'parse-cron', '~> 0.1'
53
+ spec.add_runtime_dependency 'linkeddata', '~> 3.3'
54
54
 
55
55
  spec.add_development_dependency 'bundler', '~> 2.3'
56
56
  spec.add_development_dependency 'minitest', '~> 5.18'
@@ -2,7 +2,7 @@
2
2
  require 'http'
3
3
  require 'open-uri'
4
4
  require 'nokogiri'
5
- require 'json/ld'
5
+ require 'linkeddata'
6
6
  require 'nori'
7
7
  require 'uri'
8
8
  require 'logger'
@@ -122,15 +122,24 @@ module DataCollector
122
122
  http = http.headers(options[:headers])
123
123
  end
124
124
 
125
+ ctx = nil
126
+ http_query_options = {}
125
127
  if options.key?(:verify_ssl) && uri.scheme.eql?('https')
126
128
  @logger.warn "Disabling SSL verification. "
127
129
  # shouldn't use this but we all do ...
128
130
  ctx = OpenSSL::SSL::SSLContext.new
129
131
  ctx.verify_mode = OpenSSL::SSL::VERIFY_NONE
130
132
 
131
- http_response = http.follow.get(escape_uri(uri), ssl_context: ctx)
133
+ http_query_options[:ssl_context] = ctx
134
+ end
135
+
136
+ if options.key?(:method) && options[:method].downcase.eql?('post')
137
+ raise DataCollector::InputError, "No body found, a POST request needs a body" unless options.key?(:body)
138
+ http_query_options[:body] = options[:body]
139
+
140
+ http_response = http.follow.post(escape_uri(uri), http_query_options)
132
141
  else
133
- http_response = http.follow.get(escape_uri(uri))
142
+ http_response = http.follow.get(escape_uri(uri), http_query_options)
134
143
  end
135
144
 
136
145
  case http_response.code
@@ -152,11 +161,16 @@ module DataCollector
152
161
  when 'application/atom+xml'
153
162
  data = xml_to_hash(data, options)
154
163
  when 'text/csv'
155
- data = csv_to_hash(data)
164
+ data = csv_to_hash(data, options)
156
165
  when 'application/xml'
157
166
  data = xml_to_hash(data, options)
158
167
  when 'text/xml'
159
168
  data = xml_to_hash(data, options)
169
+ when 'text/turtle'
170
+ graph = RDF::Graph.new do |graph|
171
+ RDF::Turtle::Reader.new(data) {|reader| graph << reader}
172
+ end
173
+ data = JSON.parse(graph.dump(:jsonld, validate: false, standard_prefixes: true))
160
174
  else
161
175
  data = xml_to_hash(data, options)
162
176
  end
@@ -171,7 +185,7 @@ module DataCollector
171
185
  when 404
172
186
  raise DataCollector::InputError, 'Not found'
173
187
  else
174
- raise DataCollector::InputError, "Unable to process received status code = #{http_response.code}"
188
+ raise DataCollector::InputError, "Unable to process received status code = #{http_response.code} error= #{http_response.body.to_s}"
175
189
  end
176
190
 
177
191
  #[data, http_response.code]
@@ -217,7 +231,7 @@ module DataCollector
217
231
  end #entry
218
232
  end #tar
219
233
  when '.csv'
220
- data = csv_to_hash(data)
234
+ data = csv_to_hash(data, options)
221
235
  else
222
236
  raise "Do not know how to process #{uri.to_s}"
223
237
  end
@@ -250,8 +264,15 @@ module DataCollector
250
264
  nori.parse(data)
251
265
  end
252
266
 
253
- def csv_to_hash(data)
254
- csv = CSV.parse(data, headers: true, header_converters: [:downcase, :symbol])
267
+ def csv_to_hash(data, options = {})
268
+ csv_option_keys = options.keys & CSV::DEFAULT_OPTIONS.keys
269
+ all_cvs_options = {headers: true, header_converters: [:downcase, :symbol]}
270
+
271
+ csv_option_keys.each do |k|
272
+ all_cvs_options[k] = options[k]
273
+ end
274
+
275
+ csv = CSV.parse(data, **all_cvs_options)
255
276
 
256
277
  csv.collect do |record|
257
278
  record.to_hash
@@ -1,4 +1,4 @@
1
1
  # encoding: utf-8
2
2
  module DataCollector
3
- VERSION = "0.34.0"
3
+ VERSION = "0.36.0"
4
4
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: data_collector
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.34.0
4
+ version: 0.36.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Mehmet Celik
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-11-09 00:00:00.000000000 Z
11
+ date: 2023-11-21 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activesupport
@@ -52,20 +52,6 @@ dependencies:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
54
  version: '2.6'
55
- - !ruby/object:Gem::Dependency
56
- name: json-ld
57
- requirement: !ruby/object:Gem::Requirement
58
- requirements:
59
- - - "~>"
60
- - !ruby/object:Gem::Version
61
- version: '3.3'
62
- type: :runtime
63
- prerelease: false
64
- version_requirements: !ruby/object:Gem::Requirement
65
- requirements:
66
- - - "~>"
67
- - !ruby/object:Gem::Version
68
- version: '3.3'
69
55
  - !ruby/object:Gem::Dependency
70
56
  name: jsonpath
71
57
  requirement: !ruby/object:Gem::Requirement
@@ -220,6 +206,20 @@ dependencies:
220
206
  - - "~>"
221
207
  - !ruby/object:Gem::Version
222
208
  version: '0.1'
209
+ - !ruby/object:Gem::Dependency
210
+ name: linkeddata
211
+ requirement: !ruby/object:Gem::Requirement
212
+ requirements:
213
+ - - "~>"
214
+ - !ruby/object:Gem::Version
215
+ version: '3.3'
216
+ type: :runtime
217
+ prerelease: false
218
+ version_requirements: !ruby/object:Gem::Requirement
219
+ requirements:
220
+ - - "~>"
221
+ - !ruby/object:Gem::Version
222
+ version: '3.3'
223
223
  - !ruby/object:Gem::Dependency
224
224
  name: bundler
225
225
  requirement: !ruby/object:Gem::Requirement
@@ -334,7 +334,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
334
334
  - !ruby/object:Gem::Version
335
335
  version: '0'
336
336
  requirements: []
337
- rubygems_version: 3.4.10
337
+ rubygems_version: 3.4.21
338
338
  signing_key:
339
339
  specification_version: 4
340
340
  summary: ETL helper library