data_collector 0.41.2 → 0.43.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/data_collector/ext/nokogiri.rb +36 -0
- data/lib/data_collector/input.rb +24 -0
- data/lib/data_collector/output.rb +1 -1
- data/lib/data_collector/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b2c8102350695f8cfc598856372d9efd4bfa8b9cf4c3f84110b6dfbfdff3ee62
|
4
|
+
data.tar.gz: 042bf4690116ba07f34459068e3afbc7e63f483c7e0518ecb1581833cb85b3ae
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ff4a4c20f156dadbd665ff330f9c9f057a20391b2fc4fbab45afe59bb3071fd69aaaf6c67521b37a3d5d61b581761fcb21432be405d402a3571c32e47ce86f1b
|
7
|
+
data.tar.gz: a52836f02b727c1ed679520492921bfed09892d79155bf56fa2b823792a04512cfae267d15c22210acc32a460a8f2ed2a350e6ece388e860a41b0fbb473e778d
|
@@ -0,0 +1,36 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'active_support/core_ext/hash/indifferent_access'
|
3
|
+
|
4
|
+
# Mix-in of the Nokogiri XML Node Class that implements methods to convert
|
5
|
+
# nodes to a hash.
|
6
|
+
# Author: Mario "Kuroir" Ricalde (http://kuroir.com)
|
7
|
+
# Extension of Nokogiri::XML::Node that converts a node tree into plain Ruby
# hashes and arrays.
#
# Every element is represented as
#   { :tag_name => { :attr => ["value", ...], ..., :children => [ ... ] } }
# where :children is always present so the structure stays uniform.
class Nokogiri::XML::Node
  # Convert the elements matched by a CSS selector into hashes.
  #
  # selector - CSS selector evaluated relative to this node
  #            (defaults to 'body > *').
  #
  # Returns an Array with one indifferent-access Hash per matched element.
  #
  # NOTE(review): despite its name this method returns an Array. Ruby also
  # uses #to_hash for implicit Hash conversion (e.g. `**node`), which expects
  # a Hash - confirm no caller relies on implicit conversion.
  def to_hash(selector = 'body > *')
    css(selector).map { |node| node.collect_nodes.with_indifferent_access }
  end

  # Recursively describe this node and its descendants.
  #
  # The :children key is merged in last, so it is always present and takes
  # precedence over an attribute that happens to be named "children".
  #
  # Returns a single-entry Hash keyed by the node name as a Symbol.
  def collect_nodes
    { name.to_sym => collect_attributes.merge(children: collect_children) }
  end

  # Collect the attributes of this node.
  #
  # Returns a Hash mapping each attribute name (Symbol) to its value split on
  # whitespace, e.g. class="a b" becomes { :class => ["a", "b"] }.
  def collect_attributes
    attributes.each_with_object({}) do |(attr_name, attr_value), collected|
      collected[attr_name.to_sym] = attr_value.to_s.split(/\s+/)
    end
  end

  # Collect the hash representation of every direct element child.
  #
  # Returns an Array of Hashes (empty when the node has no element children).
  def collect_children
    element_children.map(&:collect_nodes)
  end
end
|
data/lib/data_collector/input.rb
CHANGED
@@ -2,6 +2,7 @@
|
|
2
2
|
require 'http'
|
3
3
|
require 'open-uri'
|
4
4
|
require 'nokogiri'
|
5
|
+
require_relative 'ext/nokogiri'
|
5
6
|
require 'linkeddata'
|
6
7
|
require 'nori'
|
7
8
|
require 'uri'
|
@@ -15,6 +16,7 @@ require 'csv'
|
|
15
16
|
require_relative 'input/dir'
|
16
17
|
require_relative 'input/queue'
|
17
18
|
require_relative 'input/rpc'
|
19
|
+
require 'base64'
|
18
20
|
|
19
21
|
# require_relative 'ext/xml_utility_node'
|
20
22
|
module DataCollector
|
@@ -166,11 +168,16 @@ module DataCollector
|
|
166
168
|
data = xml_to_hash(data, options)
|
167
169
|
when 'text/xml'
|
168
170
|
data = xml_to_hash(data, options)
|
171
|
+
when 'text/html'
|
172
|
+
data = html_to_hash(data, options)
|
169
173
|
when 'text/turtle'
|
170
174
|
graph = RDF::Graph.new do |graph|
|
171
175
|
RDF::Turtle::Reader.new(data) {|reader| graph << reader}
|
172
176
|
end
|
173
177
|
data = JSON.parse(graph.dump(:jsonld, validate: false, standard_prefixes: true))
|
178
|
+
when /^image/
|
179
|
+
options['file_type'] = file_type
|
180
|
+
data = image_to_data(data, options)
|
174
181
|
else
|
175
182
|
data = xml_to_hash(data, options)
|
176
183
|
end
|
@@ -196,6 +203,8 @@ module DataCollector
|
|
196
203
|
data = nil
|
197
204
|
uri = normalize_uri(uri)
|
198
205
|
absolute_path = File.absolute_path(uri)
|
206
|
+
file_type = MIME::Types.type_for(uri).first.to_s
|
207
|
+
options['file_type'] = file_type
|
199
208
|
raise DataCollector::Error, "#{uri.to_s} not found" unless File.exist?("#{absolute_path}")
|
200
209
|
unless options.has_key?('raw') && options['raw'] == true
|
201
210
|
@raw = data = File.read("#{absolute_path}")
|
@@ -206,6 +215,8 @@ module DataCollector
|
|
206
215
|
data = JSON.parse(data)
|
207
216
|
when '.xml'
|
208
217
|
data = xml_to_hash(data, options)
|
218
|
+
when '.html'
|
219
|
+
data = html_to_hash(data, options)
|
209
220
|
when '.gz'
|
210
221
|
tar_data = []
|
211
222
|
Minitar.open(Zlib::GzipReader.new(File.open("#{absolute_path}", 'rb'))) do |i|
|
@@ -232,6 +243,8 @@ module DataCollector
|
|
232
243
|
end #tar
|
233
244
|
when '.csv'
|
234
245
|
data = csv_to_hash(data, options)
|
246
|
+
when '.jpg', '.png', '.gif'
|
247
|
+
data = image_to_data(data, options)
|
235
248
|
else
|
236
249
|
raise "Do not know how to process #{uri.to_s}"
|
237
250
|
end
|
@@ -252,6 +265,11 @@ module DataCollector
|
|
252
265
|
DataCollector::Input::Rpc.new(uri, options)
|
253
266
|
end
|
254
267
|
|
268
|
+
# Encode raw image bytes as an RFC 2397 data URI.
#
# data    - String holding the raw image content.
# options - Hash; 'file_type' is used as the media type of the URI
#           (e.g. 'image/png'). When it is missing the media type is left
#           empty.
#
# Returns a String of the form "data:<file_type>;base64,<payload>".
def image_to_data(data, options = {})
  media_type = options['file_type']
  # strict_encode64 emits no line feeds; encode64 would insert a newline every
  # 60 encoded characters (and a trailing one), which corrupts the URI.
  # The ";base64," marker is required for consumers to decode the payload.
  "data:#{media_type};base64,#{Base64.strict_encode64(data)}"
end
|
272
|
+
|
255
273
|
def xml_to_hash(data, options = {})
|
256
274
|
# gsub('<\/', '< /') otherwise wrong XML-parsing (see records lirias1729192 )
|
257
275
|
return unless data.is_a?(String)
|
@@ -264,6 +282,12 @@ module DataCollector
|
|
264
282
|
nori.parse(data)
|
265
283
|
end
|
266
284
|
|
285
|
+
# Parse an HTML document and convert it to the hash structure produced by the
# Nokogiri node extension's #to_hash.
#
# data    - String containing the HTML markup.
# options - Hash, accepted for signature parity with the other converters;
#           not read here.
#
# Returns the result of #to_hash on the parsed document, or nil when data is
# not a String.
def html_to_hash(data, options = {})
  return unless data.is_a?(String)

  Nokogiri::HTML(data).to_hash
end
|
290
|
+
|
267
291
|
def csv_to_hash(data, options = {})
|
268
292
|
csv_option_keys = options.keys & CSV::DEFAULT_OPTIONS.keys
|
269
293
|
all_cvs_options = {headers: true, header_converters: [:downcase, :symbol]}
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: data_collector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.41.2
|
4
|
+
version: 0.43.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Mehmet Celik
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2024-01-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|
@@ -311,6 +311,7 @@ files:
|
|
311
311
|
- lib/data_collector/config_file.rb
|
312
312
|
- lib/data_collector/core.rb
|
313
313
|
- lib/data_collector/ext/base.rb
|
314
|
+
- lib/data_collector/ext/nokogiri.rb
|
314
315
|
- lib/data_collector/ext/xml_utility_node.rb
|
315
316
|
- lib/data_collector/input.rb
|
316
317
|
- lib/data_collector/input/dir.rb
|