data_collector 0.41.2 → 0.42.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 6e47e1ddfa862a19d96aab23718b06aae7b0a84c9719a0662ea18ecae0f09c6b
4
- data.tar.gz: 115717172899307b66783f1b8aa32615590d9fdf44d3d944cf532a5d9a941be2
3
+ metadata.gz: 882f493242fb171054818f0afe724c6389920c1a9590777a3d693a5638a3a0c3
4
+ data.tar.gz: 28b47fcd8de7c19d407cfeb49d7ca24a2f9199f88dfa167dce39fbb86fb3744c
5
5
  SHA512:
6
- metadata.gz: 3ee069f8134c3d90bc3832aaafd09812182f4b6e13f38dbbc18776d18f15f9de04f1d2d049ae61bb3b77fb2777a8fd33ce4ca933be546d83eac77a5d18baeae6
7
- data.tar.gz: 83dfc12faa94aeb9aeecfd40be6c1ed74296f704f4650334dfc9099f6201671d4796c2166220160004c7fbb5a060ef09e5f29e9380c15fe953cae29931df8441
6
+ metadata.gz: 91c0b362b95077a286ad47ed526889e10a053dd93c691963f2957350e9d5b956d4703d893bcce507db6f60feb35f5a8026d260e90c83bbcd7ceb657143be6df9
7
+ data.tar.gz: 6a4c1340f09916a83a351330ae7ede7f4b0538a5f8aae84ea365409dafb66a033df421baf27282dc22ccedee7c165f0a5e1365a69e680c127b2c6bbacfb7a045
@@ -0,0 +1,36 @@
1
+ require 'nokogiri'
2
+ require 'active_support/core_ext/hash/indifferent_access'
3
+
4
+ # Mix-in of the Nokogiri XML Node Class that implements methods to convert
5
+ # nodes to a hash.
6
+ # Author: Mario "Kuroir" Ricalde (http://kuroir.com)
7
class Nokogiri::XML::Node
  # Convert the nodes matched by a CSS3 selector into nested hashes.
  #
  # Despite the method name, the return value is an Array holding one entry
  # per matched node. Entries that are Hashes are wrapped with indifferent
  # access so they can be read with either String or Symbol keys.
  #
  # @param selector [String] CSS3 selector evaluated against this node
  # @return [Array] one converted entry per matched node
  def to_hash(selector = 'body > *')
    css(selector).map do |node|
      collected = node.collect_nodes
      collected.is_a?(Hash) ? collected.with_indifferent_access : collected
    end
  end

  # Recursively describe this node as a single-key hash:
  # { node_name => attributes + { children: [...] } }.
  # The :children key is always present so the structure stays uniform,
  # and it is merged last, so it replaces an attribute named "children".
  def collect_nodes
    node_body = collect_attributes.merge(children: collect_children)
    { name.to_sym => node_body }
  end

  # Gather this node's attributes into a hash keyed by symbolized attribute
  # name. Each value is the attribute's string form split on whitespace,
  # so every attribute maps to an Array of tokens.
  def collect_attributes
    attributes.each_with_object({}) do |(attr_name, attr_value), collected|
      collected[attr_name.to_sym] = attr_value.to_s.split(/\s+/)
    end
  end

  # Convert each element child of this node with collect_nodes.
  # Returns an empty Array when the node has no element children.
  def collect_children
    element_children.map(&:collect_nodes)
  end
end
@@ -2,6 +2,7 @@
2
2
  require 'http'
3
3
  require 'open-uri'
4
4
  require 'nokogiri'
5
+ require_relative 'ext/nokogiri'
5
6
  require 'linkeddata'
6
7
  require 'nori'
7
8
  require 'uri'
@@ -15,6 +16,7 @@ require 'csv'
15
16
  require_relative 'input/dir'
16
17
  require_relative 'input/queue'
17
18
  require_relative 'input/rpc'
19
+ require 'base64'
18
20
 
19
21
  # require_relative 'ext/xml_utility_node'
20
22
  module DataCollector
@@ -166,11 +168,16 @@ module DataCollector
166
168
  data = xml_to_hash(data, options)
167
169
  when 'text/xml'
168
170
  data = xml_to_hash(data, options)
171
+ when 'text/html'
172
+ data = html_to_hash(data, options)
169
173
  when 'text/turtle'
170
174
  graph = RDF::Graph.new do |graph|
171
175
  RDF::Turtle::Reader.new(data) {|reader| graph << reader}
172
176
  end
173
177
  data = JSON.parse(graph.dump(:jsonld, validate: false, standard_prefixes: true))
178
+ when /^image/
179
+ options['file_type'] = file_type
180
+ data = image_to_data(data, options)
174
181
  else
175
182
  data = xml_to_hash(data, options)
176
183
  end
@@ -196,6 +203,8 @@ module DataCollector
196
203
  data = nil
197
204
  uri = normalize_uri(uri)
198
205
  absolute_path = File.absolute_path(uri)
206
+ file_type = MIME::Types.type_for(uri).first.to_s
207
+ options['file_type'] = file_type
199
208
  raise DataCollector::Error, "#{uri.to_s} not found" unless File.exist?("#{absolute_path}")
200
209
  unless options.has_key?('raw') && options['raw'] == true
201
210
  @raw = data = File.read("#{absolute_path}")
@@ -206,6 +215,8 @@ module DataCollector
206
215
  data = JSON.parse(data)
207
216
  when '.xml'
208
217
  data = xml_to_hash(data, options)
218
+ when '.html'
219
+ data = html_to_hash(data, options)
209
220
  when '.gz'
210
221
  tar_data = []
211
222
  Minitar.open(Zlib::GzipReader.new(File.open("#{absolute_path}", 'rb'))) do |i|
@@ -232,6 +243,8 @@ module DataCollector
232
243
  end #tar
233
244
  when '.csv'
234
245
  data = csv_to_hash(data, options)
246
+ when '.jpg', '.png', '.gif'
247
+ data = image_to_data(data, options)
235
248
  else
236
249
  raise "Do not know how to process #{uri.to_s}"
237
250
  end
@@ -252,6 +265,11 @@ module DataCollector
252
265
  DataCollector::Input::Rpc.new(uri, options)
253
266
  end
254
267
 
268
# Encode raw image bytes as an RFC 2397 data URI.
#
# The URI has the form "data:<file_type>;base64,<payload>". The ";base64,"
# marker is required for consumers to recognise the payload encoding, and
# the payload is produced with strict Base64 so it contains no line breaks
# (Base64.encode64 inserts a newline every 60 encoded characters).
#
# @param data [String] raw image content
# @param options [Hash] reads 'file_type', the MIME type placed in the URI
# @return [String] the data URI
def image_to_data(data, options = {})
  file_type = options['file_type']
  "data:#{file_type};base64,#{Base64.strict_encode64(data)}"
end
272
+
255
273
  def xml_to_hash(data, options = {})
256
274
  # gsub('&lt;\/', '&lt; /') otherwise wrong XML-parsing (see records lirias1729192 )
257
275
  return unless data.is_a?(String)
@@ -264,6 +282,12 @@ module DataCollector
264
282
  nori.parse(data)
265
283
  end
266
284
 
285
# Parse an HTML string with Nokogiri and convert the resulting document
# through its to_hash method.
#
# @param data [String] HTML markup; any non-String input yields nil
# @param options [Hash] accepted but not read by this method
# @return [Object, nil] whatever the parsed document's to_hash returns,
#   or nil when data is not a String
def html_to_hash(data, options = {})
  Nokogiri::HTML(data).to_hash if data.is_a?(String)
end
290
+
267
291
  def csv_to_hash(data, options = {})
268
292
  csv_option_keys = options.keys & CSV::DEFAULT_OPTIONS.keys
269
293
  all_cvs_options = {headers: true, header_converters: [:downcase, :symbol]}
@@ -1,4 +1,4 @@
1
1
  # encoding: utf-8
2
2
  module DataCollector
3
- VERSION = "0.41.2"
3
+ VERSION = "0.42.0"
4
4
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: data_collector
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.41.2
4
+ version: 0.42.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Mehmet Celik
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-12-04 00:00:00.000000000 Z
11
+ date: 2024-01-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activesupport
@@ -311,6 +311,7 @@ files:
311
311
  - lib/data_collector/config_file.rb
312
312
  - lib/data_collector/core.rb
313
313
  - lib/data_collector/ext/base.rb
314
+ - lib/data_collector/ext/nokogiri.rb
314
315
  - lib/data_collector/ext/xml_utility_node.rb
315
316
  - lib/data_collector/input.rb
316
317
  - lib/data_collector/input/dir.rb