data_collector 0.34.0 → 0.36.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +11 -0
- data/data_collector.gemspec +1 -1
- data/lib/data_collector/input.rb +29 -8
- data/lib/data_collector/version.rb +1 -1
- metadata +17 -17
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 649b618a7676b2153451b2bc3053726dd332ccc95a58ce0fe7115f4c9cc5a7eb
|
4
|
+
data.tar.gz: 625ca0e9da4e6b986d7ead973c80fc04a87a5539f0f7263efc672ee0f3ef7eb1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2372b50b9fc9e6f5a17575c1755e169f0ddbaef44ed96847d8f5c87b28a3e30e46276cf47f6e506fe843a2293190b54fd556978ea175f937693abd4f83a637dd
|
7
|
+
data.tar.gz: 7a417a9ee3a70140a3601d20613159222f6c2d6d0d7feaa99c447c7e31306f14c6f08c50543cb8fda85315b5fe62abde202e2b3bda51ac421f6a517f7b6be740
|
data/README.md
CHANGED
@@ -84,6 +84,8 @@ A push happens when new data is created in a directory, message queue, ...
|
|
84
84
|
- content_type: _string_ force a content_type if the 'Content-Type' returned by the http server is incorrect
|
85
85
|
- headers: request headers
|
86
86
|
- cookies: session cookies etc.
|
87
|
+
- method: http verb, one of [GET, POST], default('GET')
|
88
|
+
- body: http post body
|
87
89
|
|
88
90
|
###### example:
|
89
91
|
```ruby
|
@@ -91,6 +93,8 @@ A push happens when new data is created in a directory, message queue, ...
|
|
91
93
|
input.from_uri("http://www.libis.be")
|
92
94
|
input.from_uri("file://hello.txt")
|
93
95
|
input.from_uri("http://www.libis.be/record.jsonld", content_type: 'application/ld+json')
|
96
|
+
input.from_uri("https://www.w3.org/TR/rdf12-turtle/examples/example1.ttl")
|
97
|
+
input.from_uri("https://dbpedia.org/sparql", body: "query=SELECT * WHERE {?sub ?pred ?obj} LIMIT 10", method:"POST", headers: {accept: "text/turtle"})
|
94
98
|
|
95
99
|
# read data from a RabbitMQ queue
|
96
100
|
listener = input.from_uri('amqp://user:password@localhost?channel=hello&queue=world')
|
@@ -399,6 +403,13 @@ Should give as output
|
|
399
403
|
</data>
|
400
404
|
```
|
401
405
|
|
406
|
+
You can pass options to input.from_uri to control how CSV data is parsed. These
|
407
|
+
are the same options accepted by the Ruby [CSV](https://docs.ruby-lang.org/en/master/CSV.html#class-CSV-label-Options) class.
|
408
|
+
|
409
|
+
Loading a CSV file with **;** as the column separator
|
410
|
+
```ruby
|
411
|
+
i = input.from_uri('https://support.staffbase.com/hc/en-us/article_attachments/360009197031/username.csv', col_sep: ';')
|
412
|
+
```
|
402
413
|
|
403
414
|
## Installation
|
404
415
|
|
data/data_collector.gemspec
CHANGED
@@ -39,7 +39,6 @@ Gem::Specification.new do |spec|
|
|
39
39
|
spec.add_runtime_dependency 'activesupport', '~> 7.0'
|
40
40
|
spec.add_runtime_dependency 'http', '~> 5.1'
|
41
41
|
spec.add_runtime_dependency 'json', '~> 2.6'
|
42
|
-
spec.add_runtime_dependency 'json-ld', '~> 3.3'
|
43
42
|
spec.add_runtime_dependency 'jsonpath', '~> 1.1'
|
44
43
|
spec.add_runtime_dependency 'mime-types', '~> 3.5'
|
45
44
|
spec.add_runtime_dependency 'minitar', '= 0.9'
|
@@ -51,6 +50,7 @@ Gem::Specification.new do |spec|
|
|
51
50
|
spec.add_runtime_dependency 'bunny_burrow', '~> 1.5'
|
52
51
|
spec.add_runtime_dependency 'builder', '~> 3.2'
|
53
52
|
spec.add_runtime_dependency 'parse-cron', '~> 0.1'
|
53
|
+
spec.add_runtime_dependency 'linkeddata', '~> 3.3'
|
54
54
|
|
55
55
|
spec.add_development_dependency 'bundler', '~> 2.3'
|
56
56
|
spec.add_development_dependency 'minitest', '~> 5.18'
|
data/lib/data_collector/input.rb
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
require 'http'
|
3
3
|
require 'open-uri'
|
4
4
|
require 'nokogiri'
|
5
|
-
require '
|
5
|
+
require 'linkeddata'
|
6
6
|
require 'nori'
|
7
7
|
require 'uri'
|
8
8
|
require 'logger'
|
@@ -122,15 +122,24 @@ module DataCollector
|
|
122
122
|
http = http.headers(options[:headers])
|
123
123
|
end
|
124
124
|
|
125
|
+
ctx = nil
|
126
|
+
http_query_options = {}
|
125
127
|
if options.key?(:verify_ssl) && uri.scheme.eql?('https')
|
126
128
|
@logger.warn "Disabling SSL verification. "
|
127
129
|
# shouldn't use this but we all do ...
|
128
130
|
ctx = OpenSSL::SSL::SSLContext.new
|
129
131
|
ctx.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
130
132
|
|
131
|
-
|
133
|
+
http_query_options[:ssl_context] = ctx
|
134
|
+
end
|
135
|
+
|
136
|
+
if options.key?(:method) && options[:method].downcase.eql?('post')
|
137
|
+
raise DataCollector::InputError, "No body found, a POST request needs a body" unless options.key?(:body)
|
138
|
+
http_query_options[:body] = options[:body]
|
139
|
+
|
140
|
+
http_response = http.follow.post(escape_uri(uri), http_query_options)
|
132
141
|
else
|
133
|
-
http_response = http.follow.get(escape_uri(uri))
|
142
|
+
http_response = http.follow.get(escape_uri(uri), http_query_options)
|
134
143
|
end
|
135
144
|
|
136
145
|
case http_response.code
|
@@ -152,11 +161,16 @@ module DataCollector
|
|
152
161
|
when 'application/atom+xml'
|
153
162
|
data = xml_to_hash(data, options)
|
154
163
|
when 'text/csv'
|
155
|
-
data = csv_to_hash(data)
|
164
|
+
data = csv_to_hash(data, options)
|
156
165
|
when 'application/xml'
|
157
166
|
data = xml_to_hash(data, options)
|
158
167
|
when 'text/xml'
|
159
168
|
data = xml_to_hash(data, options)
|
169
|
+
when 'text/turtle'
|
170
|
+
graph = RDF::Graph.new do |graph|
|
171
|
+
RDF::Turtle::Reader.new(data) {|reader| graph << reader}
|
172
|
+
end
|
173
|
+
data = JSON.parse(graph.dump(:jsonld, validate: false, standard_prefixes: true))
|
160
174
|
else
|
161
175
|
data = xml_to_hash(data, options)
|
162
176
|
end
|
@@ -171,7 +185,7 @@ module DataCollector
|
|
171
185
|
when 404
|
172
186
|
raise DataCollector::InputError, 'Not found'
|
173
187
|
else
|
174
|
-
raise DataCollector::InputError, "Unable to process received status code = #{http_response.code}"
|
188
|
+
raise DataCollector::InputError, "Unable to process received status code = #{http_response.code} error= #{http_response.body.to_s}"
|
175
189
|
end
|
176
190
|
|
177
191
|
#[data, http_response.code]
|
@@ -217,7 +231,7 @@ module DataCollector
|
|
217
231
|
end #entry
|
218
232
|
end #tar
|
219
233
|
when '.csv'
|
220
|
-
data = csv_to_hash(data)
|
234
|
+
data = csv_to_hash(data, options)
|
221
235
|
else
|
222
236
|
raise "Do not know how to process #{uri.to_s}"
|
223
237
|
end
|
@@ -250,8 +264,15 @@ module DataCollector
|
|
250
264
|
nori.parse(data)
|
251
265
|
end
|
252
266
|
|
253
|
-
def csv_to_hash(data)
|
254
|
-
|
267
|
+
def csv_to_hash(data, options = {})
|
268
|
+
csv_option_keys = options.keys & CSV::DEFAULT_OPTIONS.keys
|
269
|
+
all_cvs_options = {headers: true, header_converters: [:downcase, :symbol]}
|
270
|
+
|
271
|
+
csv_option_keys.each do |k|
|
272
|
+
all_cvs_options[k] = options[k]
|
273
|
+
end
|
274
|
+
|
275
|
+
csv = CSV.parse(data, **all_cvs_options)
|
255
276
|
|
256
277
|
csv.collect do |record|
|
257
278
|
record.to_hash
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: data_collector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.34.0
|
4
|
+
version: 0.36.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Mehmet Celik
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-11-
|
11
|
+
date: 2023-11-21 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|
@@ -52,20 +52,6 @@ dependencies:
|
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '2.6'
|
55
|
-
- !ruby/object:Gem::Dependency
|
56
|
-
name: json-ld
|
57
|
-
requirement: !ruby/object:Gem::Requirement
|
58
|
-
requirements:
|
59
|
-
- - "~>"
|
60
|
-
- !ruby/object:Gem::Version
|
61
|
-
version: '3.3'
|
62
|
-
type: :runtime
|
63
|
-
prerelease: false
|
64
|
-
version_requirements: !ruby/object:Gem::Requirement
|
65
|
-
requirements:
|
66
|
-
- - "~>"
|
67
|
-
- !ruby/object:Gem::Version
|
68
|
-
version: '3.3'
|
69
55
|
- !ruby/object:Gem::Dependency
|
70
56
|
name: jsonpath
|
71
57
|
requirement: !ruby/object:Gem::Requirement
|
@@ -220,6 +206,20 @@ dependencies:
|
|
220
206
|
- - "~>"
|
221
207
|
- !ruby/object:Gem::Version
|
222
208
|
version: '0.1'
|
209
|
+
- !ruby/object:Gem::Dependency
|
210
|
+
name: linkeddata
|
211
|
+
requirement: !ruby/object:Gem::Requirement
|
212
|
+
requirements:
|
213
|
+
- - "~>"
|
214
|
+
- !ruby/object:Gem::Version
|
215
|
+
version: '3.3'
|
216
|
+
type: :runtime
|
217
|
+
prerelease: false
|
218
|
+
version_requirements: !ruby/object:Gem::Requirement
|
219
|
+
requirements:
|
220
|
+
- - "~>"
|
221
|
+
- !ruby/object:Gem::Version
|
222
|
+
version: '3.3'
|
223
223
|
- !ruby/object:Gem::Dependency
|
224
224
|
name: bundler
|
225
225
|
requirement: !ruby/object:Gem::Requirement
|
@@ -334,7 +334,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
334
334
|
- !ruby/object:Gem::Version
|
335
335
|
version: '0'
|
336
336
|
requirements: []
|
337
|
-
rubygems_version: 3.4.
|
337
|
+
rubygems_version: 3.4.21
|
338
338
|
signing_key:
|
339
339
|
specification_version: 4
|
340
340
|
summary: ETL helper library
|