data_collector 0.34.0 → 0.36.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +11 -0
- data/data_collector.gemspec +1 -1
- data/lib/data_collector/input.rb +29 -8
- data/lib/data_collector/version.rb +1 -1
- metadata +17 -17
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 649b618a7676b2153451b2bc3053726dd332ccc95a58ce0fe7115f4c9cc5a7eb
|
4
|
+
data.tar.gz: 625ca0e9da4e6b986d7ead973c80fc04a87a5539f0f7263efc672ee0f3ef7eb1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2372b50b9fc9e6f5a17575c1755e169f0ddbaef44ed96847d8f5c87b28a3e30e46276cf47f6e506fe843a2293190b54fd556978ea175f937693abd4f83a637dd
|
7
|
+
data.tar.gz: 7a417a9ee3a70140a3601d20613159222f6c2d6d0d7feaa99c447c7e31306f14c6f08c50543cb8fda85315b5fe62abde202e2b3bda51ac421f6a517f7b6be740
|
data/README.md
CHANGED
@@ -84,6 +84,8 @@ A push happens when new data is created in a directory, message queue, ...
|
|
84
84
|
- content_type: _string_ force a content_type if the 'Content-Type' returned by the http server is incorrect
|
85
85
|
- headers: request headers
|
86
86
|
- cookies: session cookies etc.
|
87
|
+
- method: http verb, one of [GET, POST], default('GET')
|
88
|
+
- body: http post body
|
87
89
|
|
88
90
|
###### example:
|
89
91
|
```ruby
|
@@ -91,6 +93,8 @@ A push happens when new data is created in a directory, message queue, ...
|
|
91
93
|
input.from_uri("http://www.libis.be")
|
92
94
|
input.from_uri("file://hello.txt")
|
93
95
|
input.from_uri("http://www.libis.be/record.jsonld", content_type: 'application/ld+json')
|
96
|
+
input.from_uri("https://www.w3.org/TR/rdf12-turtle/examples/example1.ttl")
|
97
|
+
input.from_uri("https://dbpedia.org/sparql", body: "query=SELECT * WHERE {?sub ?pred ?obj} LIMIT 10", method:"POST", headers: {accept: "text/turtle"})
|
94
98
|
|
95
99
|
# read data from a RabbitMQ queue
|
96
100
|
listener = input.from_uri('amqp://user:password@localhost?channel=hello&queue=world')
|
@@ -399,6 +403,13 @@ Should give as output
|
|
399
403
|
</data>
|
400
404
|
```
|
401
405
|
|
406
|
+
You can provide options to input.from_uri for better reading of CSV formats; these
|
407
|
+
are the same as the options of the Ruby [CSV](https://docs.ruby-lang.org/en/master/CSV.html#class-CSV-label-Options) class
|
408
|
+
|
409
|
+
Loading a CSV file with **;** as the column separator
|
410
|
+
```ruby
|
411
|
+
i = input.from_uri('https://support.staffbase.com/hc/en-us/article_attachments/360009197031/username.csv', col_sep: ';')
|
412
|
+
```
|
402
413
|
|
403
414
|
## Installation
|
404
415
|
|
data/data_collector.gemspec
CHANGED
@@ -39,7 +39,6 @@ Gem::Specification.new do |spec|
|
|
39
39
|
spec.add_runtime_dependency 'activesupport', '~> 7.0'
|
40
40
|
spec.add_runtime_dependency 'http', '~> 5.1'
|
41
41
|
spec.add_runtime_dependency 'json', '~> 2.6'
|
42
|
-
spec.add_runtime_dependency 'json-ld', '~> 3.3'
|
43
42
|
spec.add_runtime_dependency 'jsonpath', '~> 1.1'
|
44
43
|
spec.add_runtime_dependency 'mime-types', '~> 3.5'
|
45
44
|
spec.add_runtime_dependency 'minitar', '= 0.9'
|
@@ -51,6 +50,7 @@ Gem::Specification.new do |spec|
|
|
51
50
|
spec.add_runtime_dependency 'bunny_burrow', '~> 1.5'
|
52
51
|
spec.add_runtime_dependency 'builder', '~> 3.2'
|
53
52
|
spec.add_runtime_dependency 'parse-cron', '~> 0.1'
|
53
|
+
spec.add_runtime_dependency 'linkeddata', '~> 3.3'
|
54
54
|
|
55
55
|
spec.add_development_dependency 'bundler', '~> 2.3'
|
56
56
|
spec.add_development_dependency 'minitest', '~> 5.18'
|
data/lib/data_collector/input.rb
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
require 'http'
|
3
3
|
require 'open-uri'
|
4
4
|
require 'nokogiri'
|
5
|
-
require '
|
5
|
+
require 'linkeddata'
|
6
6
|
require 'nori'
|
7
7
|
require 'uri'
|
8
8
|
require 'logger'
|
@@ -122,15 +122,24 @@ module DataCollector
|
|
122
122
|
http = http.headers(options[:headers])
|
123
123
|
end
|
124
124
|
|
125
|
+
ctx = nil
|
126
|
+
http_query_options = {}
|
125
127
|
if options.key?(:verify_ssl) && uri.scheme.eql?('https')
|
126
128
|
@logger.warn "Disabling SSL verification. "
|
127
129
|
# shouldn't use this but we all do ...
|
128
130
|
ctx = OpenSSL::SSL::SSLContext.new
|
129
131
|
ctx.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
130
132
|
|
131
|
-
|
133
|
+
http_query_options[:ssl_context] = ctx
|
134
|
+
end
|
135
|
+
|
136
|
+
if options.key?(:method) && options[:method].downcase.eql?('post')
|
137
|
+
raise DataCollector::InputError, "No body found, a POST request needs a body" unless options.key?(:body)
|
138
|
+
http_query_options[:body] = options[:body]
|
139
|
+
|
140
|
+
http_response = http.follow.post(escape_uri(uri), http_query_options)
|
132
141
|
else
|
133
|
-
http_response = http.follow.get(escape_uri(uri))
|
142
|
+
http_response = http.follow.get(escape_uri(uri), http_query_options)
|
134
143
|
end
|
135
144
|
|
136
145
|
case http_response.code
|
@@ -152,11 +161,16 @@ module DataCollector
|
|
152
161
|
when 'application/atom+xml'
|
153
162
|
data = xml_to_hash(data, options)
|
154
163
|
when 'text/csv'
|
155
|
-
data = csv_to_hash(data)
|
164
|
+
data = csv_to_hash(data, options)
|
156
165
|
when 'application/xml'
|
157
166
|
data = xml_to_hash(data, options)
|
158
167
|
when 'text/xml'
|
159
168
|
data = xml_to_hash(data, options)
|
169
|
+
when 'text/turtle'
|
170
|
+
graph = RDF::Graph.new do |graph|
|
171
|
+
RDF::Turtle::Reader.new(data) {|reader| graph << reader}
|
172
|
+
end
|
173
|
+
data = JSON.parse(graph.dump(:jsonld, validate: false, standard_prefixes: true))
|
160
174
|
else
|
161
175
|
data = xml_to_hash(data, options)
|
162
176
|
end
|
@@ -171,7 +185,7 @@ module DataCollector
|
|
171
185
|
when 404
|
172
186
|
raise DataCollector::InputError, 'Not found'
|
173
187
|
else
|
174
|
-
raise DataCollector::InputError, "Unable to process received status code = #{http_response.code}"
|
188
|
+
raise DataCollector::InputError, "Unable to process received status code = #{http_response.code} error= #{http_response.body.to_s}"
|
175
189
|
end
|
176
190
|
|
177
191
|
#[data, http_response.code]
|
@@ -217,7 +231,7 @@ module DataCollector
|
|
217
231
|
end #entry
|
218
232
|
end #tar
|
219
233
|
when '.csv'
|
220
|
-
data = csv_to_hash(data)
|
234
|
+
data = csv_to_hash(data, options)
|
221
235
|
else
|
222
236
|
raise "Do not know how to process #{uri.to_s}"
|
223
237
|
end
|
@@ -250,8 +264,15 @@ module DataCollector
|
|
250
264
|
nori.parse(data)
|
251
265
|
end
|
252
266
|
|
253
|
-
def csv_to_hash(data)
|
254
|
-
|
267
|
+
def csv_to_hash(data, options = {})
|
268
|
+
csv_option_keys = options.keys & CSV::DEFAULT_OPTIONS.keys
|
269
|
+
all_cvs_options = {headers: true, header_converters: [:downcase, :symbol]}
|
270
|
+
|
271
|
+
csv_option_keys.each do |k|
|
272
|
+
all_cvs_options[k] = options[k]
|
273
|
+
end
|
274
|
+
|
275
|
+
csv = CSV.parse(data, **all_cvs_options)
|
255
276
|
|
256
277
|
csv.collect do |record|
|
257
278
|
record.to_hash
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: data_collector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.34.0
|
4
|
+
version: 0.36.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Mehmet Celik
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-11-
|
11
|
+
date: 2023-11-21 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|
@@ -52,20 +52,6 @@ dependencies:
|
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '2.6'
|
55
|
-
- !ruby/object:Gem::Dependency
|
56
|
-
name: json-ld
|
57
|
-
requirement: !ruby/object:Gem::Requirement
|
58
|
-
requirements:
|
59
|
-
- - "~>"
|
60
|
-
- !ruby/object:Gem::Version
|
61
|
-
version: '3.3'
|
62
|
-
type: :runtime
|
63
|
-
prerelease: false
|
64
|
-
version_requirements: !ruby/object:Gem::Requirement
|
65
|
-
requirements:
|
66
|
-
- - "~>"
|
67
|
-
- !ruby/object:Gem::Version
|
68
|
-
version: '3.3'
|
69
55
|
- !ruby/object:Gem::Dependency
|
70
56
|
name: jsonpath
|
71
57
|
requirement: !ruby/object:Gem::Requirement
|
@@ -220,6 +206,20 @@ dependencies:
|
|
220
206
|
- - "~>"
|
221
207
|
- !ruby/object:Gem::Version
|
222
208
|
version: '0.1'
|
209
|
+
- !ruby/object:Gem::Dependency
|
210
|
+
name: linkeddata
|
211
|
+
requirement: !ruby/object:Gem::Requirement
|
212
|
+
requirements:
|
213
|
+
- - "~>"
|
214
|
+
- !ruby/object:Gem::Version
|
215
|
+
version: '3.3'
|
216
|
+
type: :runtime
|
217
|
+
prerelease: false
|
218
|
+
version_requirements: !ruby/object:Gem::Requirement
|
219
|
+
requirements:
|
220
|
+
- - "~>"
|
221
|
+
- !ruby/object:Gem::Version
|
222
|
+
version: '3.3'
|
223
223
|
- !ruby/object:Gem::Dependency
|
224
224
|
name: bundler
|
225
225
|
requirement: !ruby/object:Gem::Requirement
|
@@ -334,7 +334,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
334
334
|
- !ruby/object:Gem::Version
|
335
335
|
version: '0'
|
336
336
|
requirements: []
|
337
|
-
rubygems_version: 3.4.
|
337
|
+
rubygems_version: 3.4.21
|
338
338
|
signing_key:
|
339
339
|
specification_version: 4
|
340
340
|
summary: ETL helper library
|