data_collector 0.41.2 → 0.42.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/data_collector/ext/nokogiri.rb +36 -0
- data/lib/data_collector/input.rb +24 -0
- data/lib/data_collector/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 882f493242fb171054818f0afe724c6389920c1a9590777a3d693a5638a3a0c3
|
4
|
+
data.tar.gz: 28b47fcd8de7c19d407cfeb49d7ca24a2f9199f88dfa167dce39fbb86fb3744c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 91c0b362b95077a286ad47ed526889e10a053dd93c691963f2957350e9d5b956d4703d893bcce507db6f60feb35f5a8026d260e90c83bbcd7ceb657143be6df9
|
7
|
+
data.tar.gz: 6a4c1340f09916a83a351330ae7ede7f4b0538a5f8aae84ea365409dafb66a033df421baf27282dc22ccedee7c165f0a5e1365a69e680c127b2c6bbacfb7a045
|
@@ -0,0 +1,36 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'active_support/core_ext/hash/indifferent_access'
|
3
|
+
|
4
|
+
# Mix-in of the Nokogiri XML Node Class that implements methods to convert
|
5
|
+
# nodes to a hash.
|
6
|
+
# Author: Mario "Kuroir" Ricalde (http://kuroir.com)
|
7
|
+
class Nokogiri::XML::Node
  # Convert the nodes matched by a CSS selector into nested hashes.
  #
  # NOTE(review): Ruby uses #to_hash as its implicit Hash-conversion hook,
  # so defining it on every node may affect code that passes nodes where a
  # Hash is accepted -- confirm this is intended.
  #
  # selector - CSS selector String (default: 'body > *').
  #
  # Returns an Array with one entry per matched node; entries that are
  # Hashes are wrapped with indifferent access.
  def to_hash(selector = 'body > *')
    css(selector).map do |node|
      collected = node.collect_nodes
      collected.is_a?(Hash) ? collected.with_indifferent_access : collected
    end
  end

  # Recursively describe this node as a single-key Hash:
  #   { <node name as Symbol> => { <attributes...>, children: [...] } }
  # The :children key is always present so the structure stays uniform,
  # even for leaf nodes (where it is an empty Array).
  def collect_nodes
    { name.to_sym => collect_attributes.merge(children: collect_children) }
  end

  # Collect the attributes of this node.
  #
  # Returns a Hash mapping each attribute name (Symbol) to the Array of
  # whitespace-separated tokens of its value.
  def collect_attributes
    # Fill one Hash in place instead of allocating a merged copy per attribute.
    attributes.each_with_object({}) do |(attr_name, attr_value), collected|
      collected[attr_name.to_sym] = attr_value.to_s.split(/\s+/)
    end
  end

  # Collect the element children of this node (text nodes are skipped,
  # since only element_children is walked).
  #
  # Returns an Array of the children's #collect_nodes results; empty when
  # the node has no element children.
  def collect_children
    element_children.map(&:collect_nodes)
  end
end
|
data/lib/data_collector/input.rb
CHANGED
@@ -2,6 +2,7 @@
|
|
2
2
|
require 'http'
|
3
3
|
require 'open-uri'
|
4
4
|
require 'nokogiri'
|
5
|
+
require_relative 'ext/nokogiri'
|
5
6
|
require 'linkeddata'
|
6
7
|
require 'nori'
|
7
8
|
require 'uri'
|
@@ -15,6 +16,7 @@ require 'csv'
|
|
15
16
|
require_relative 'input/dir'
|
16
17
|
require_relative 'input/queue'
|
17
18
|
require_relative 'input/rpc'
|
19
|
+
require 'base64'
|
18
20
|
|
19
21
|
# require_relative 'ext/xml_utility_node'
|
20
22
|
module DataCollector
|
@@ -166,11 +168,16 @@ module DataCollector
|
|
166
168
|
data = xml_to_hash(data, options)
|
167
169
|
when 'text/xml'
|
168
170
|
data = xml_to_hash(data, options)
|
171
|
+
when 'text/html'
|
172
|
+
data = html_to_hash(data, options)
|
169
173
|
when 'text/turtle'
|
170
174
|
graph = RDF::Graph.new do |graph|
|
171
175
|
RDF::Turtle::Reader.new(data) {|reader| graph << reader}
|
172
176
|
end
|
173
177
|
data = JSON.parse(graph.dump(:jsonld, validate: false, standard_prefixes: true))
|
178
|
+
when /^image/
|
179
|
+
options['file_type'] = file_type
|
180
|
+
data = image_to_data(data, options)
|
174
181
|
else
|
175
182
|
data = xml_to_hash(data, options)
|
176
183
|
end
|
@@ -196,6 +203,8 @@ module DataCollector
|
|
196
203
|
data = nil
|
197
204
|
uri = normalize_uri(uri)
|
198
205
|
absolute_path = File.absolute_path(uri)
|
206
|
+
file_type = MIME::Types.type_for(uri).first.to_s
|
207
|
+
options['file_type'] = file_type
|
199
208
|
raise DataCollector::Error, "#{uri.to_s} not found" unless File.exist?("#{absolute_path}")
|
200
209
|
unless options.has_key?('raw') && options['raw'] == true
|
201
210
|
@raw = data = File.read("#{absolute_path}")
|
@@ -206,6 +215,8 @@ module DataCollector
|
|
206
215
|
data = JSON.parse(data)
|
207
216
|
when '.xml'
|
208
217
|
data = xml_to_hash(data, options)
|
218
|
+
when '.html'
|
219
|
+
data = html_to_hash(data, options)
|
209
220
|
when '.gz'
|
210
221
|
tar_data = []
|
211
222
|
Minitar.open(Zlib::GzipReader.new(File.open("#{absolute_path}", 'rb'))) do |i|
|
@@ -232,6 +243,8 @@ module DataCollector
|
|
232
243
|
end #tar
|
233
244
|
when '.csv'
|
234
245
|
data = csv_to_hash(data, options)
|
246
|
+
when '.jpg', '.png', '.gif'
|
247
|
+
data = image_to_data(data, options)
|
235
248
|
else
|
236
249
|
raise "Do not know how to process #{uri.to_s}"
|
237
250
|
end
|
@@ -252,6 +265,11 @@ module DataCollector
|
|
252
265
|
DataCollector::Input::Rpc.new(uri, options)
|
253
266
|
end
|
254
267
|
|
268
|
+
# Encode raw image bytes as an RFC 2397 data URI.
#
# data    - String holding the raw image bytes.
# options - Hash; options['file_type'] supplies the media type that is
#           written into the URI.
#
# Returns a String of the form "data:<file_type>;base64,<payload>".
def image_to_data(data, options = {})
  file_type = options['file_type']
  # The ";base64," marker is required for the payload to be read as Base64.
  # strict_encode64 is used because encode64 inserts line feeds into the
  # output, which do not belong inside a data URI.
  "data:#{file_type};base64,#{Base64.strict_encode64(data)}"
end
|
272
|
+
|
255
273
|
def xml_to_hash(data, options = {})
|
256
274
|
# gsub('<\/', '< /') otherwise wrong XML-parsing (see records lirias1729192 )
|
257
275
|
return unless data.is_a?(String)
|
@@ -264,6 +282,12 @@ module DataCollector
|
|
264
282
|
nori.parse(data)
|
265
283
|
end
|
266
284
|
|
285
|
+
# Parse an HTML string with Nokogiri and convert the resulting document by
# calling #to_hash on it with no arguments.
#
# data    - HTML source; anything that is not a String yields nil.
# options - accepted but not read by this method.
#
# Returns the result of #to_hash on the parsed document, or nil.
def html_to_hash(data, options = {})
  return nil unless data.is_a?(String)

  Nokogiri::HTML(data).to_hash
end
|
290
|
+
|
267
291
|
def csv_to_hash(data, options = {})
|
268
292
|
csv_option_keys = options.keys & CSV::DEFAULT_OPTIONS.keys
|
269
293
|
all_cvs_options = {headers: true, header_converters: [:downcase, :symbol]}
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: data_collector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.41.2
|
4
|
+
version: 0.42.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Mehmet Celik
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2024-01-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|
@@ -311,6 +311,7 @@ files:
|
|
311
311
|
- lib/data_collector/config_file.rb
|
312
312
|
- lib/data_collector/core.rb
|
313
313
|
- lib/data_collector/ext/base.rb
|
314
|
+
- lib/data_collector/ext/nokogiri.rb
|
314
315
|
- lib/data_collector/ext/xml_utility_node.rb
|
315
316
|
- lib/data_collector/input.rb
|
316
317
|
- lib/data_collector/input/dir.rb
|