data_collector 0.41.2 → 0.42.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 6e47e1ddfa862a19d96aab23718b06aae7b0a84c9719a0662ea18ecae0f09c6b
4
- data.tar.gz: 115717172899307b66783f1b8aa32615590d9fdf44d3d944cf532a5d9a941be2
3
+ metadata.gz: 882f493242fb171054818f0afe724c6389920c1a9590777a3d693a5638a3a0c3
4
+ data.tar.gz: 28b47fcd8de7c19d407cfeb49d7ca24a2f9199f88dfa167dce39fbb86fb3744c
5
5
  SHA512:
6
- metadata.gz: 3ee069f8134c3d90bc3832aaafd09812182f4b6e13f38dbbc18776d18f15f9de04f1d2d049ae61bb3b77fb2777a8fd33ce4ca933be546d83eac77a5d18baeae6
7
- data.tar.gz: 83dfc12faa94aeb9aeecfd40be6c1ed74296f704f4650334dfc9099f6201671d4796c2166220160004c7fbb5a060ef09e5f29e9380c15fe953cae29931df8441
6
+ metadata.gz: 91c0b362b95077a286ad47ed526889e10a053dd93c691963f2957350e9d5b956d4703d893bcce507db6f60feb35f5a8026d260e90c83bbcd7ceb657143be6df9
7
+ data.tar.gz: 6a4c1340f09916a83a351330ae7ede7f4b0538a5f8aae84ea365409dafb66a033df421baf27282dc22ccedee7c165f0a5e1365a69e680c127b2c6bbacfb7a045
@@ -0,0 +1,36 @@
1
+ require 'nokogiri'
2
+ require 'active_support/core_ext/hash/indifferent_access'
3
+
4
+ # Mix-in of the Nokogiri XML Node Class that implements methods to convert
5
+ # nodes to a hash.
6
+ # Author: Mario "Kuroir" Ricalde (http://kuroir.com)
7
class Nokogiri::XML::Node
  # Convert the nodes matched by a CSS selector into nested hashes.
  #
  # NOTE(review): despite the name, this returns an Array with one entry per
  # matched node. `to_hash` is also the method name Ruby uses for implicit
  # Hash conversion, so the Array return value may surprise code relying on
  # that protocol. The name is kept because existing callers depend on it.
  #
  # @param selector [String] CSS selector evaluated against this node
  # @return [Array] one collected hash (with indifferent access) per match
  def to_hash(selector = 'body > *')
    css(selector).map do |node|
      collected = node.collect_nodes
      collected.is_a?(Hash) ? collected.with_indifferent_access : collected
    end
  end

  # Build the hash for this node, keyed by the node name as a Symbol.
  # The :children key is always added (possibly as an empty Array) so every
  # level of the result has the same shape.
  #
  # @return [Hash] { node_name => attributes merged with :children }
  def collect_nodes
    { name.to_sym => collect_attributes.merge(children: collect_children) }
  end

  # Collect the attributes of this node.
  # Each value is the attribute's string form split on whitespace, so a
  # multi-token attribute such as class="a b" becomes ["a", "b"].
  #
  # @return [Hash{Symbol => Array<String>}]
  def collect_attributes
    attributes.each_with_object({}) do |(attr_name, attr), collected|
      collected[attr_name.to_sym] = attr.to_s.split(/\s+/)
    end
  end

  # Recursively collect the element children of this node.
  #
  # @return [Array<Hash>] one collect_nodes result per element child
  def collect_children
    element_children.map(&:collect_nodes)
  end
end
@@ -2,6 +2,7 @@
2
2
  require 'http'
3
3
  require 'open-uri'
4
4
  require 'nokogiri'
5
+ require_relative 'ext/nokogiri'
5
6
  require 'linkeddata'
6
7
  require 'nori'
7
8
  require 'uri'
@@ -15,6 +16,7 @@ require 'csv'
15
16
  require_relative 'input/dir'
16
17
  require_relative 'input/queue'
17
18
  require_relative 'input/rpc'
19
+ require 'base64'
18
20
 
19
21
  # require_relative 'ext/xml_utility_node'
20
22
  module DataCollector
@@ -166,11 +168,16 @@ module DataCollector
166
168
  data = xml_to_hash(data, options)
167
169
  when 'text/xml'
168
170
  data = xml_to_hash(data, options)
171
+ when 'text/html'
172
+ data = html_to_hash(data, options)
169
173
  when 'text/turtle'
170
174
  graph = RDF::Graph.new do |graph|
171
175
  RDF::Turtle::Reader.new(data) {|reader| graph << reader}
172
176
  end
173
177
  data = JSON.parse(graph.dump(:jsonld, validate: false, standard_prefixes: true))
178
+ when /^image/
179
+ options['file_type'] = file_type
180
+ data = image_to_data(data, options)
174
181
  else
175
182
  data = xml_to_hash(data, options)
176
183
  end
@@ -196,6 +203,8 @@ module DataCollector
196
203
  data = nil
197
204
  uri = normalize_uri(uri)
198
205
  absolute_path = File.absolute_path(uri)
206
+ file_type = MIME::Types.type_for(uri).first.to_s
207
+ options['file_type'] = file_type
199
208
  raise DataCollector::Error, "#{uri.to_s} not found" unless File.exist?("#{absolute_path}")
200
209
  unless options.has_key?('raw') && options['raw'] == true
201
210
  @raw = data = File.read("#{absolute_path}")
@@ -206,6 +215,8 @@ module DataCollector
206
215
  data = JSON.parse(data)
207
216
  when '.xml'
208
217
  data = xml_to_hash(data, options)
218
+ when '.html'
219
+ data = html_to_hash(data, options)
209
220
  when '.gz'
210
221
  tar_data = []
211
222
  Minitar.open(Zlib::GzipReader.new(File.open("#{absolute_path}", 'rb'))) do |i|
@@ -232,6 +243,8 @@ module DataCollector
232
243
  end #tar
233
244
  when '.csv'
234
245
  data = csv_to_hash(data, options)
246
+ when '.jpg', '.png', '.gif'
247
+ data = image_to_data(data, options)
235
248
  else
236
249
  raise "Do not know how to process #{uri.to_s}"
237
250
  end
@@ -252,6 +265,11 @@ module DataCollector
252
265
  DataCollector::Input::Rpc.new(uri, options)
253
266
  end
254
267
 
268
# Encode raw image bytes as a data URI ("data:<mime type>;base64,<payload>").
#
# The payload is produced with strict Base64 encoding: a data URI must be a
# single token, and Base64.encode64 would insert a line feed every 60
# encoded characters plus a trailing one.
#
# @param data [String] raw image bytes
# @param options [Hash] reads 'file_type', the MIME type placed in the URI
# @return [String] the data URI
def image_to_data(data, options = {})
  file_type = options['file_type']
  "data:#{file_type};base64,#{Base64.strict_encode64(data)}"
end
272
+
255
273
  def xml_to_hash(data, options = {})
256
274
  # gsub('&lt;\/', '&lt; /') otherwise the XML is parsed incorrectly (see record lirias1729192)
257
275
  return unless data.is_a?(String)
@@ -264,6 +282,12 @@ module DataCollector
264
282
  nori.parse(data)
265
283
  end
266
284
 
285
# Parse an HTML string with Nokogiri and convert the resulting document via
# its to_hash method (called with that method's default selector).
#
# @param data [String] HTML markup; any non-String input yields nil
# @param options [Hash] accepted for signature parity; not read by this method
# @return [Object, nil] whatever the parsed document's to_hash returns, or nil
def html_to_hash(data, options = {})
  return nil unless String === data

  Nokogiri::HTML(data).to_hash
end
290
+
267
291
  def csv_to_hash(data, options = {})
268
292
  csv_option_keys = options.keys & CSV::DEFAULT_OPTIONS.keys
269
293
  all_cvs_options = {headers: true, header_converters: [:downcase, :symbol]}
@@ -1,4 +1,4 @@
1
1
  # encoding: utf-8
2
2
  module DataCollector
3
- VERSION = "0.41.2"
3
+ VERSION = "0.42.0"
4
4
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: data_collector
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.41.2
4
+ version: 0.42.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Mehmet Celik
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-12-04 00:00:00.000000000 Z
11
+ date: 2024-01-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activesupport
@@ -311,6 +311,7 @@ files:
311
311
  - lib/data_collector/config_file.rb
312
312
  - lib/data_collector/core.rb
313
313
  - lib/data_collector/ext/base.rb
314
+ - lib/data_collector/ext/nokogiri.rb
314
315
  - lib/data_collector/ext/xml_utility_node.rb
315
316
  - lib/data_collector/input.rb
316
317
  - lib/data_collector/input/dir.rb