data_collector 0.41.1 → 0.42.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7129e581e83ed3d1d2ac2a7df0f755d55b278c99a3caf83ee1d1b93ff143b440
4
- data.tar.gz: 2914af4cd633fa592a770d579e8d27d5d69a9c0559829161ff210a26040fe498
3
+ metadata.gz: 882f493242fb171054818f0afe724c6389920c1a9590777a3d693a5638a3a0c3
4
+ data.tar.gz: 28b47fcd8de7c19d407cfeb49d7ca24a2f9199f88dfa167dce39fbb86fb3744c
5
5
  SHA512:
6
- metadata.gz: 2629bcf0144699d9196a28884d30af0db03a0afd28392114e66df80e924a2e1945005f0c0ead108e015487c78050d7485d03a123a6822635413462f09c3f3930
7
- data.tar.gz: 927b22389fcebad4e66f446a5ed131b85e05fde2aa6fa3343f3382112c5eed45f4a6c787f4d845126fc9c46f66cf4a7c9bb9abec170ec49f200fda41b3b36274
6
+ metadata.gz: 91c0b362b95077a286ad47ed526889e10a053dd93c691963f2957350e9d5b956d4703d893bcce507db6f60feb35f5a8026d260e90c83bbcd7ceb657143be6df9
7
+ data.tar.gz: 6a4c1340f09916a83a351330ae7ede7f4b0538a5f8aae84ea365409dafb66a033df421baf27282dc22ccedee7c165f0a5e1365a69e680c127b2c6bbacfb7a045
@@ -1,37 +1,41 @@
1
- require 'connection_pool'
1
+ require 'securerandom'
2
+ #require 'connection_pool'
2
3
 
3
4
  module BunnyBurrow
4
- class Connection
5
- include Singleton
6
- attr_reader :connection
7
- attr_accessor :verify_peer, :connection_name, :rabbitmq_url
8
-
9
- def connection
10
- unless @connection
11
- @connection = Bunny.new(@rabbitmq_url, verify_peer: @verify_peer, connection_name: @connection_name)
12
- @connection.start
13
- end
14
-
15
- @connection.start unless @connection.connected? || @connection.closed?
16
- #@connection.start if @connection.closed?
17
-
18
- #pp @connection.status
19
-
20
- @connection
21
- end
22
-
23
- def channel
24
- @channel = connection.create_channel unless @channel && @channel.open?
25
-
26
- @channel
27
- end
28
- end
5
+ # class Connection
6
+ # include Singleton
7
+ # attr_reader :connection
8
+ # attr_accessor :verify_peer, :connection_name, :rabbitmq_url
9
+ #
10
+ # def connection
11
+ # unless @connection
12
+ # @connection = Bunny.new(@rabbitmq_url, verify_peer: @verify_peer, connection_name: @connection_name)
13
+ # @connection.start
14
+ # end
15
+ #
16
+ # @connection.start unless @connection.connected? || @connection.closed?
17
+ # #@connection.start if @connection.closed?
18
+ #
19
+ # #pp @connection.status
20
+ #
21
+ # @connection
22
+ # end
23
+ #
24
+ # def channel
25
+ # @channel = connection.create_channel unless @channel && @channel.open?
26
+ #
27
+ # @channel
28
+ # end
29
+ # end
29
30
 
30
31
  class Base
31
32
  attr_accessor(
32
33
  :connection_name
33
34
  )
34
35
 
36
+ def initialize
37
+ @connection_name = "#{Process.pid}-#{SecureRandom.uuid}"
38
+ end
35
39
  # private
36
40
  #
37
41
  # def connection
@@ -0,0 +1,36 @@
1
+ require 'nokogiri'
2
+ require 'active_support/core_ext/hash/indifferent_access'
3
+
4
+ # Mix-in of the Nokogiri XML Node Class that implements methods to convert
5
+ # nodes to a hash.
6
+ # Author: Mario "Kuroir" Ricalde (http://kuroir.com)
7
+ class Nokogiri::XML::Node
8
+ # Convert the nodes matched by a CSS3 selector to hashes. The selector is passed as an argument.
9
+ # Returns an array with one entry per matched node (hashes get indifferent access).
10
+ def to_hash(selector = 'body > *')
11
+ hash = []
12
+ self.css(selector).each do |node|
13
+ hash << node.collect_nodes
14
+ end
15
+ # Return the hash
16
+ hash.map{|m| m.is_a?(Hash) ? m.with_indifferent_access : m}
17
+ end
18
+
19
+ # Recursive method to collect nodes.
20
+ # We always add the :children key so that the structure stays consistent.
21
+ def collect_nodes
22
+ { self.name.to_sym => self.collect_attributes.merge({:children => collect_children }) }
23
+ end
24
+
25
+ # Collect Attributes of a given node.
26
+ def collect_attributes
27
+ output = {}
28
+ self.attributes.each { |name, value| output = output.merge({ name.to_sym => value.to_s.split(/\s+/) }) }
29
+ output
30
+ end
31
+
32
+ # Collect the element children of this node, converting each one via collect_nodes.
33
+ def collect_children
34
+ self.element_children.collect { |child| child.collect_nodes } || []
35
+ end
36
+ end
@@ -2,6 +2,7 @@
2
2
  require 'http'
3
3
  require 'open-uri'
4
4
  require 'nokogiri'
5
+ require_relative 'ext/nokogiri'
5
6
  require 'linkeddata'
6
7
  require 'nori'
7
8
  require 'uri'
@@ -15,6 +16,7 @@ require 'csv'
15
16
  require_relative 'input/dir'
16
17
  require_relative 'input/queue'
17
18
  require_relative 'input/rpc'
19
+ require 'base64'
18
20
 
19
21
  # require_relative 'ext/xml_utility_node'
20
22
  module DataCollector
@@ -166,11 +168,16 @@ module DataCollector
166
168
  data = xml_to_hash(data, options)
167
169
  when 'text/xml'
168
170
  data = xml_to_hash(data, options)
171
+ when 'text/html'
172
+ data = html_to_hash(data, options)
169
173
  when 'text/turtle'
170
174
  graph = RDF::Graph.new do |graph|
171
175
  RDF::Turtle::Reader.new(data) {|reader| graph << reader}
172
176
  end
173
177
  data = JSON.parse(graph.dump(:jsonld, validate: false, standard_prefixes: true))
178
+ when /^image/
179
+ options['file_type'] = file_type
180
+ data = image_to_data(data, options)
174
181
  else
175
182
  data = xml_to_hash(data, options)
176
183
  end
@@ -196,6 +203,8 @@ module DataCollector
196
203
  data = nil
197
204
  uri = normalize_uri(uri)
198
205
  absolute_path = File.absolute_path(uri)
206
+ file_type = MIME::Types.type_for(uri).first.to_s
207
+ options['file_type'] = file_type
199
208
  raise DataCollector::Error, "#{uri.to_s} not found" unless File.exist?("#{absolute_path}")
200
209
  unless options.has_key?('raw') && options['raw'] == true
201
210
  @raw = data = File.read("#{absolute_path}")
@@ -206,6 +215,8 @@ module DataCollector
206
215
  data = JSON.parse(data)
207
216
  when '.xml'
208
217
  data = xml_to_hash(data, options)
218
+ when '.html'
219
+ data = html_to_hash(data, options)
209
220
  when '.gz'
210
221
  tar_data = []
211
222
  Minitar.open(Zlib::GzipReader.new(File.open("#{absolute_path}", 'rb'))) do |i|
@@ -232,6 +243,8 @@ module DataCollector
232
243
  end #tar
233
244
  when '.csv'
234
245
  data = csv_to_hash(data, options)
246
+ when '.jpg', '.png', '.gif'
247
+ data = image_to_data(data, options)
235
248
  else
236
249
  raise "Do not know how to process #{uri.to_s}"
237
250
  end
@@ -252,6 +265,11 @@ module DataCollector
252
265
  DataCollector::Input::Rpc.new(uri, options)
253
266
  end
254
267
 
268
+ def image_to_data(data, options = {})
269
+ file_type = options['file_type']
270
+ "data:#{file_type};#{Base64.encode64(data)}"
271
+ end
272
+
255
273
  def xml_to_hash(data, options = {})
256
274
  # gsub('&lt;\/', '&lt; /') otherwise wrong XML-parsing (see records lirias1729192 )
257
275
  return unless data.is_a?(String)
@@ -264,6 +282,12 @@ module DataCollector
264
282
  nori.parse(data)
265
283
  end
266
284
 
285
+ def html_to_hash(data, options = {})
286
+ return unless data.is_a?(String)
287
+ html_data = Nokogiri::HTML(data)
288
+ html_data.to_hash
289
+ end
290
+
267
291
  def csv_to_hash(data, options = {})
268
292
  csv_option_keys = options.keys & CSV::DEFAULT_OPTIONS.keys
269
293
  all_cvs_options = {headers: true, header_converters: [:downcase, :symbol]}
@@ -1,4 +1,4 @@
1
1
  # encoding: utf-8
2
2
  module DataCollector
3
- VERSION = "0.41.1"
3
+ VERSION = "0.42.0"
4
4
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: data_collector
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.41.1
4
+ version: 0.42.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Mehmet Celik
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-12-04 00:00:00.000000000 Z
11
+ date: 2024-01-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activesupport
@@ -311,6 +311,7 @@ files:
311
311
  - lib/data_collector/config_file.rb
312
312
  - lib/data_collector/core.rb
313
313
  - lib/data_collector/ext/base.rb
314
+ - lib/data_collector/ext/nokogiri.rb
314
315
  - lib/data_collector/ext/xml_utility_node.rb
315
316
  - lib/data_collector/input.rb
316
317
  - lib/data_collector/input/dir.rb