data_collector 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 8410e7e260bf1989dfee0d4279b4c7d57dcd6276a5219c3bfb48663659044436
4
+ data.tar.gz: 0d0e78c365f927287ea7875392e83ac946da4e9ca8a764e86dc34d0e03e195b5
5
+ SHA512:
6
+ metadata.gz: 72ed7c28c8f9513be5ff136827dcf07336b1fee931c17049448c8c915530e919a278dd6e478119dac3c8fb1c20e98cce1314cdbd33dd71110cb07e0fc7ab7286
7
+ data.tar.gz: 20ea6369d094f13ce763305369037591a5c3412562fe350fccd58b9f4de8bfd5d37b830e49214d835c2ef039ca2252bac001ea41f571bde2ab51302e3d4c3146
data/.gitignore ADDED
@@ -0,0 +1,11 @@
1
+ /.idea/
2
+ /.bundle/
3
+ /.yardoc
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
10
+ *.gem
11
+ Gemfile.lock
data/.travis.yml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ sudo: false
3
+ language: ruby
4
+ cache: bundler
5
+ rvm:
6
+ - 2.6.1
7
+ before_install: gem install bundler -v 2.0.1
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
# Gems are resolved from the public RubyGems index.
source 'https://rubygems.org'

# All dependencies of this gem are declared in data_collector.gemspec.
gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2019 Mehmet Celik
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,174 @@
1
+ # DataCollector
2
+ Convenience module to Extract, Transform and Load your data.
3
+
4
+ #### input
5
+ Read input from a URI
6
+ example:
7
+ ```ruby
8
+ input.from_uri("http://www.libis.be")
9
+ input.from_uri("file://hello.txt")
10
+ ```
11
+
12
+ Inputs can be JSON, XML or CSV
13
+
14
+ ### output
15
+ Output is an object in which you can store data that needs to be written to an output stream.
16
+ ```ruby
17
+ output[:name] = 'John'
18
+ output[:last_name] = 'Doe'
19
+ ```
20
+
21
+ Write output to a file or a string, using an ERB file as a template
22
+ example:
23
+ ___test.erb___
24
+ ```ruby
25
+ <names>
26
+ <combined><%= data[:name] %> <%= data[:last_name] %></combined>
27
+ <%= print data, :name, :first_name %>
28
+ <%= print data, :last_name %>
29
+ </names>
30
+ ```
31
+ will produce
32
+ ```ruby
33
+ <names>
34
+ <combined>John Doe</combined>
35
+ <first_name>John</first_name>
36
+ <last_name>Doe</last_name>
37
+ </names>
38
+ ```
39
+
40
+ Into a variable
41
+ ```ruby
42
+ result = output.to_s("test.erb")
43
+ ```
44
+
45
+ Into a file stored in records dir
46
+ ```ruby
47
+ output.to_file("test.erb")
48
+ ```
49
+
50
+ Into a tar file stored in the records directory
51
+ ```ruby
52
+ output.to_file("test.erb", "my_data.tar.gz")
53
+ ```
54
+
55
+ Other output methods
56
+ ```ruby
57
+ output.raw
58
+ output.clear
59
+ output.to_tmp_file("test.erb", "tmp_data")
60
+ output.to_jsonfile(data, "test")
61
+ output.flatten
62
+ ```
63
+
64
+ Into a temp directory
65
+ ```ruby
66
+ output.to_tmp_file("test.erb","directory")
67
+ ```
68
+
69
+ #### filter
70
+ filter data from a hash using [JsonPath](http://goessner.net/articles/JsonPath/index.html)
71
+
72
+ ```ruby
73
+ filtered_data = filter(data, "$..metadata.record")
74
+ ```
75
+
76
+ #### config
77
+ config is an object that points to "config.yml". You can read data from and store data in this object.
78
+
79
+ ___read___
80
+ ```ruby
81
+ config[:active]
82
+ ```
83
+ ___write___
84
+ ```ruby
85
+ config[:active] = false
86
+ ```
87
+ #### log
88
+ Log to stdout
89
+ ```ruby
90
+ log("hello world")
91
+ ```
92
+
93
+
94
+ ## Example
95
+ Input data ___test.csv___
96
+ ```csv
97
+ sequence, data
98
+ 1, apple
99
+ 2, banana
100
+ 3, peach
101
+ ```
102
+
103
+ Output template ___test.erb___
104
+ ```ruby
105
+ <data>
106
+ <% data[:record].each do |d| %>
107
+ <record sequence="<%= d[:sequence] %>">
108
+ <%= print d, :data %>
109
+ </record>
110
+ <% end %>
111
+ </data>
112
+ ```
113
+
114
+ ```ruby
115
+ require 'data_collector'
116
+ include DataCollector::Core
117
+
118
+ data = input.from_uri('file://test.csv')
119
+ data.map{ |m| m[:sequence] *=2; m }
120
+
121
+ output[:record]=data
122
+
123
+ puts output.to_s('test.erb')
124
+ ```
125
+
126
+ Should give as output
127
+ ```xml
128
+ <data>
129
+ <record sequence="11">
130
+ <data> apple</data>
131
+ </record>
132
+ <record sequence="22">
133
+ <data> banana</data>
134
+ </record>
135
+ <record sequence="33">
136
+ <data> peach</data>
137
+ </record>
138
+ </data>
139
+ ```
140
+
141
+
142
+ ## Installation
143
+
144
+ Add this line to your application's Gemfile:
145
+
146
+ ```ruby
147
+ gem 'data_collector'
148
+ ```
149
+
150
+ And then execute:
151
+
152
+ $ bundle
153
+
154
+ Or install it yourself as:
155
+
156
+ $ gem install data_collector
157
+
158
+ ## Usage
159
+
160
+ TODO: Write usage instructions here
161
+
162
+ ## Development
163
+
164
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
165
+
166
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
167
+
168
+ ## Contributing
169
+
170
+ Bug reports and pull requests are welcome on GitHub at https://github.com/mehmetc/data_collector.
171
+
172
+ ## License
173
+
174
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
data/Rakefile ADDED
@@ -0,0 +1,10 @@
1
+ require "bundler/gem_tasks"
2
+ require "rake/testtask"
3
+
4
# Define the `test` task: it runs every *_test.rb file found below test/
# with both test/ and lib/ added to the load path.
Rake::TestTask.new(:test) do |test_task|
  test_task.libs.push("test", "lib")
  test_task.test_files = FileList["test/**/*_test.rb"]
end

# A bare `rake` invocation runs the test task.
task default: :test
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
require 'bundler/setup'
require 'data_collector'

# Fixtures and/or initialization code can be added here to make
# experimenting with the gem easier.
#
# To use a different console such as Pry, add pry to the Gemfile and
# replace the two lines at the bottom with:
#   require 'pry'
#   Pry.start

require 'irb'
IRB.start(__FILE__)
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
# Abort on command errors, on unset variables and on failures inside pipelines.
set -o errexit -o nounset -o pipefail
# Split words on newlines and tabs only.
IFS=$'\n\t'
# Echo every input line and every executed command.
set -o verbose -o xtrace

bundle install

# Add any other automated setup steps below.
@@ -0,0 +1,52 @@
1
+
2
# Put lib/ on the load path so the version constant can be required below.
lib = File.expand_path("../lib", __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require "data_collector/version"

Gem::Specification.new do |spec|
  spec.name          = "data_collector"
  spec.version       = DataCollector::VERSION
  spec.authors       = ["Mehmet Celik"]
  spec.email         = ["mehmet@celik.be"]

  spec.summary       = %q{ETL library}
  spec.description   = %q{INPUT, FILTER, OUTPUT data}
  spec.homepage      = "https://github.com/mehmetc/data_collector"
  spec.license       = "MIT"

  # Pushes are NOT restricted to a single host: 'allowed_push_host' is left
  # unset on purpose. Set spec.metadata["allowed_push_host"] to lock pushes
  # to one gem server.
  if spec.respond_to?(:metadata)
    # All project links point at the same repository as the homepage; the
    # previous values referenced "data_collect", which is not the homepage repo.
    spec.metadata["homepage_uri"] = spec.homepage
    spec.metadata["source_code_uri"] = spec.homepage
    spec.metadata["changelog_uri"] = spec.homepage
  else
    raise "RubyGems 2.0 or newer is required to protect against " \
      "public gem pushes."
  end

  # Specify which files should be added to the gem when it is released.
  # `git ls-files -z` lists the files tracked by git; test, spec and feature
  # directories are left out of the packaged gem.
  spec.files         = Dir.chdir(File.expand_path('..', __FILE__)) do
    `git ls-files -z`.split("\x0").reject { |f| f.match?(%r{^(test|spec|features)/}) }
  end
  spec.bindir        = "exe"
  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  spec.add_runtime_dependency "nokogiri", "~> 1.10"
  spec.add_runtime_dependency "json", "~> 2.2"
  spec.add_runtime_dependency "jsonpath", "~> 1.0"
  spec.add_runtime_dependency "nori", "~> 2.6"
  spec.add_runtime_dependency "http", "~> 4.1"
  spec.add_runtime_dependency "mime-types", "~> 3.2"
  spec.add_runtime_dependency "minitar", "= 0.8"
  spec.add_runtime_dependency "activesupport", "~> 5.2"
  spec.add_runtime_dependency "redis", "~> 4.1"

  spec.add_development_dependency "bundler", "~> 2.0"
  spec.add_development_dependency "rake", "~> 10.0"
  spec.add_development_dependency "minitest", "~> 5.0"
end
@@ -0,0 +1,11 @@
1
+ #encoding: UTF-8
2
+ require 'active_support/core_ext/hash'
3
+ require 'logger'
4
+
5
+ require 'data_collector/version'
6
+ require 'data_collector/runner'
7
+ require 'data_collector/ext/xml_utility_node'
8
+
9
module DataCollector
  # Library-specific error class, derived from StandardError.
  class Error < StandardError
  end
end
@@ -0,0 +1,72 @@
1
+ #encoding: UTF-8
2
+
3
+ require 'yaml'
4
+
5
module DataCollector
  # Class-level key/value store backed by a "config.yml" file.
  #
  # The file is looked up in the directory set through ConfigFile.path=, or,
  # when no path was set, in the current directory and then in ./config.
  # Keys are exposed as symbols (nested hashes included); every write is
  # persisted to the file immediately.
  class ConfigFile
    @config = {}
    @config_file_path = ''

    # Version of the config file handling itself (not the gem version).
    def self.version
      '0.0.1'
    end

    # Directory that holds config.yml ('' until set or discovered).
    def self.path
      @config_file_path
    end

    # Sets the directory that holds config.yml. The cached configuration is
    # dropped so the next access reads the file from the new location.
    def self.path=(config_file_path)
      @config = {} unless config_file_path == @config_file_path
      @config_file_path = config_file_path
    end

    # Returns the value stored under +key+ (a symbol), or nil.
    def self.[](key)
      init
      @config[key]
    end

    # Stores +value+ under +key+ and rewrites config.yml.
    #
    # Symbol keys are written as plain strings: a file containing `:key:`
    # entries cannot be read back by YAML.load_file on Psych 4 (safe loading
    # rejects Symbol), while string keys load everywhere and are symbolized
    # again on read.
    def self.[]=(key, value)
      init
      @config[key] = value
      File.write("#{path}/config.yml", stringify_keys(@config).to_yaml)
    end

    # True when +key+ is present in the configuration.
    def self.include?(key)
      init
      @config.include?(key)
    end

    # Loads config.yml into memory the first time it is needed (and again
    # for as long as the loaded configuration is empty).
    # Raises Errno::ENOENT when the file cannot be found.
    private_class_method def self.init
      discover_config_file_path
      if @config.empty?
        # NOTE(review): on Psych < 4 load_file deserializes arbitrary objects;
        # the config file is expected to be a trusted local file.
        # An empty file yields nil/false instead of a Hash, hence the fallback.
        config = YAML::load_file("#{path}/config.yml") || {}
        @config = process(config)
      end
    end

    # When no path was set, uses '.' or 'config', whichever contains a
    # config.yml (current directory first).
    private_class_method def self.discover_config_file_path
      if @config_file_path.nil? || @config_file_path.empty?
        if File.exist?('config.yml')
          @config_file_path = '.'
        elsif File.exist?("config/config.yml")
          @config_file_path = 'config'
        end
      end
    end

    # Returns a copy of +config+ with keys converted to symbols, recursing
    # into nested hashes. Keys that cannot be symbolized (e.g. integers) are
    # kept unchanged instead of raising.
    private_class_method def self.process(config)
      config.each_with_object({}) do |(k, v), new_config|
        v = process(v) if v.is_a?(Hash)
        new_config.store(k.respond_to?(:to_sym) ? k.to_sym : k, v)
      end
    end

    # Inverse of process for persisting: symbol keys become strings,
    # recursing into nested hashes; all other keys are kept as they are.
    private_class_method def self.stringify_keys(config)
      config.each_with_object({}) do |(k, v), plain|
        plain[k.is_a?(Symbol) ? k.to_s : k] = v.is_a?(Hash) ? stringify_keys(v) : v
      end
    end
  end
end
@@ -0,0 +1,76 @@
1
+ # encoding: utf-8
2
+ require 'jsonpath'
3
+
4
+ require_relative 'input'
5
+ require_relative 'output'
6
+ require_relative 'config_file'
7
+
8
module DataCollector
  # Mixin that exposes the ETL helpers (input, output, filter, config, log)
  # to the object that includes it.
  module Core
    # Lazily created DataCollector::Input, used to read data from a URI.
    # example: input.from_uri("http://www.libis.be")
    #          input.from_uri("file://hello.txt")
    def input
      @input ||= DataCollector::Input.new
    end

    # Lazily created Output: an object in which you can store data that
    # needs to be written to an output stream.
    #    output[:name] = 'John'
    #    output[:last_name] = 'Doe'
    #
    # Write output to a file or a string, using an ERB file as a template.
    # example:
    #   test.erb
    #   <names>
    #     <combined><%= data[:name] %> <%= data[:last_name] %></combined>
    #     <%= print data, :name, :first_name %>
    #     <%= print data, :last_name %>
    #   </names>
    #
    #   will produce
    #   <names>
    #     <combined>John Doe</combined>
    #     <first_name>John</first_name>
    #     <last_name>Doe</last_name>
    #   </names>
    #
    # Into a variable
    #   result = output.to_s("test.erb")
    # Into a file stored in the records directory
    #   output.to_file("test.erb")
    # Into a tar file stored in the records directory
    #   output.to_file("test.erb", "my_data.tar.gz")
    # Into a temp directory
    #   output.to_tmp_file("test.erb","directory")
    def output
      @output ||= Output.new
    end

    # Filters +data+.
    #
    # * +filter_path+ is an Array and +data+ is an Array of hashes: every hash
    #   is reduced to the entries whose key (converted to a symbol) is listed
    #   in +filter_path+.
    # * +filter_path+ is a String: it is evaluated as a JsonPath expression.
    #   Online evaluator:  http://jsonpath.com/
    #   Explanation:       http://goessner.net/articles/JsonPath/index.html
    # * anything else yields an empty Array.
    #
    # The result is always an Array; a result that consists of one single
    # nested Array is unwrapped. Errors are logged and produce [].
    def filter(data, filter_path)
      filtered = []
      if filter_path.is_a?(Array) && data.is_a?(Array)
        filtered = data.map { |m| m.select { |k, _v| filter_path.include?(k.to_sym) } }
      elsif filter_path.is_a?(String)
        filtered = JsonPath.on(data, filter_path)
      end

      filtered = [filtered] unless filtered.is_a?(Array)
      filtered = filtered.first if filtered.length == 1 && filtered.first.is_a?(Array)

      filtered
    rescue StandardError => e
      # @logger is only set by some includers (e.g. Runner); create a default
      # one on demand so error reporting can never raise NoMethodError itself.
      (@logger ||= Logger.new(STDOUT)).error("#{filter_path} failed: #{e.message}")
      []
    end

    # The ConfigFile class, which reads from and writes to "config.yml".
    def config
      @config ||= ConfigFile
    end

    # Logs +message+ at info level, using the includer's @logger when it is
    # set and a logger on STDOUT otherwise.
    def log(message)
      (@logger ||= Logger.new(STDOUT)).info(message)
    end
  end
end
@@ -0,0 +1,72 @@
1
+ require "date"
2
+ require "time"
3
+ require "yaml"
4
+ require "bigdecimal"
5
+
6
+ require "nori/string_with_attributes"
7
+ require "nori/string_io_file"
8
+
9
class Nori
  # Reopens Nori's XMLUtilityNode to replace #to_hash: children that share a
  # tag name are collected into one array under that name. The previous
  # implementation stays reachable as #old_to_hash.
  class XMLUtilityNode
    alias_method :old_to_hash, :to_hash

    # Converts this node (and, recursively, its children) into a Hash of the
    # form { name => value }.
    def to_hash
      # "file" nodes carry base64 content that is decoded into a StringIOFile.
      if @type == "file"
        file = StringIOFile.new((@children.first || '').unpack('m').first)
        file.original_filename = attributes['name'] || 'untitled'
        file.content_type = attributes['content_type'] || 'application/octet-stream'
        return { name => file }
      end

      # Text nodes: typecast the text; a String result that has attributes
      # becomes a hash with the text stored under "$text".
      if @text
        text = typecast_value(inner_html)
        text = advanced_typecasting(text) if text.is_a?(String) && @options[:advanced_typecasting]
        if text.is_a?(String) && !attributes.empty?
          text = { "$text" => text }.merge(prefixed_attributes)
        end
        return { name => text }
      end

      # Change repeating groups into an array: children grouped by tag name,
      # in order of first appearance.
      groups = @children.group_by { |child| child.name }

      if @type == "array"
        collected = groups.map do |tag, nodes|
          if nodes.size == 1
            nodes.first.to_hash.entries.first.last
          else
            nodes.map { |node| node.to_hash[tag] }
          end
        end
        out = collected.flatten
      else # Hash
        out = {}
        groups.each do |tag, nodes|
          if nodes.size == 1
            out.merge!(nodes.first.to_hash)
          else
            out.merge!(tag => nodes.map { |node| node.to_hash[tag] })
          end
        end
        out.merge!(prefixed_attributes) unless attributes.empty?
        out = @options[:empty_tag_value] if out.empty?
      end

      value = @type && out.nil? ? typecast_value(out) : out
      { name => value }
    end
  end
end
@@ -0,0 +1,172 @@
1
+ #encoding: UTF-8
2
+ require 'http'
3
+ require 'open-uri'
4
+ require 'nokogiri'
5
+ require 'json'
6
+ require 'nori'
7
+ require 'uri'
8
+ require 'logger'
9
+ require 'cgi'
10
+ require 'mime/types'
11
+ require 'active_support/core_ext/hash'
12
+ require 'zlib'
13
+ require 'minitar'
14
+ require 'csv'
15
+
16
+ #require_relative 'ext/xml_utility_node'
17
+ module DataCollector
18
+ class Input
19
+ attr_reader :raw
20
+
21
+ def initialize
22
+ @logger = Logger.new(STDOUT)
23
+ end
24
+
25
+ def from_uri(source, options = {})
26
+ source = CGI.unescapeHTML(source)
27
+ @logger.info("Loading #{source}")
28
+ uri = URI(source)
29
+ begin
30
+ data = nil
31
+ case uri.scheme
32
+ when 'http'
33
+ data = from_http(uri, options)
34
+ when 'https'
35
+ data = from_https(uri, options)
36
+ when 'file'
37
+ data = from_file(uri, options)
38
+ else
39
+ raise "Do not know how to process #{source}"
40
+ end
41
+
42
+ data = data.nil? ? 'no data found' : data
43
+
44
+ if block_given?
45
+ yield data
46
+ else
47
+ data
48
+ end
49
+ rescue => e
50
+ @logger.info(e.message)
51
+ puts e.backtrace.join("\n")
52
+ nil
53
+ end
54
+ end
55
+
56
+ private
57
+ def from_http(uri, options = {})
58
+ from_https(uri, options)
59
+ end
60
+
61
+ def from_https(uri, options = {})
62
+ data = nil
63
+ http = HTTP
64
+
65
+ if options.keys.include?(:user) && options.keys.include?(:password)
66
+ user = options[:user]
67
+ password = options[:password]
68
+ http = HTTP.basic_auth(user: user, pass: password)
69
+ else
70
+ @logger.warn ("User or Password parameter not found")
71
+ end
72
+
73
+ http_response = http.get(escape_uri(uri))
74
+
75
+ case http_response.code
76
+ when 200
77
+ @raw = data = http_response.body.to_s
78
+
79
+ # File.open("#{rand(1000)}.xml", 'wb') do |f|
80
+ # f.puts data
81
+ # end
82
+
83
+ file_type = file_type_from(http_response.headers)
84
+
85
+ unless options.with_indifferent_access.has_key?(:raw) && options.with_indifferent_access[:raw] == true
86
+ case file_type
87
+ when 'application/json'
88
+ data = JSON.parse(data)
89
+ when 'application/atom+xml'
90
+ data = xml_to_hash(data)
91
+ when 'text/csv'
92
+ data = csv_to_hash(data)
93
+ when 'application/xml'
94
+ when 'text/xml'
95
+ data = xml_to_hash(data)
96
+ else
97
+ data = xml_to_hash(data)
98
+ end
99
+ end
100
+ when 401
101
+ raise 'Unauthorized'
102
+ when 404
103
+ raise 'Not found'
104
+ else
105
+ raise "Unable to process received status code = #{http_response.code}"
106
+ end
107
+
108
+ data
109
+ end
110
+
111
+ def from_file(uri, options = {})
112
+ data = nil
113
+ absolute_path = File.absolute_path("#{uri.host}#{uri.path}")
114
+ unless options.has_key?('raw') && options['raw'] == true
115
+ @raw = data = File.read("#{absolute_path}")
116
+ case File.extname(absolute_path)
117
+ when '.json'
118
+ data = JSON.parse(data)
119
+ when '.xml'
120
+ data = xml_to_hash(data)
121
+ when '.gz'
122
+ Minitar.open(Zlib::GzipReader.new(File.open("#{absolute_path}", 'rb'))) do |i|
123
+ i.each do |entry|
124
+ data = entry.read
125
+ end
126
+ end
127
+ data = xml_to_hash(data)
128
+ when '.csv'
129
+ data = csv_to_hash(data)
130
+ else
131
+ raise "Do not know how to process #{uri.to_s}"
132
+ end
133
+ end
134
+
135
+ data
136
+ end
137
+
138
+ private
139
+ def xml_to_hash(data)
140
+ #gsub('&lt;\/', '&lt; /') outherwise wrong XML-parsing (see records lirias1729192 )
141
+ data = data.gsub /&lt;/, '&lt; /'
142
+ nori = Nori.new(parser: :nokogiri, strip_namespaces: true, convert_tags_to: lambda {|tag| tag.gsub(/^@/, '_')})
143
+ nori.parse(data)
144
+ #JSON.parse(nori.parse(data).to_json)
145
+ end
146
+
147
+ def csv_to_hash(data)
148
+ csv = CSV.parse(data, headers: true, header_converters: [:downcase, :symbol])
149
+
150
+ csv.collect do |record|
151
+ record.to_hash
152
+ end
153
+ end
154
+
155
+ def escape_uri(uri)
156
+ #"#{uri.to_s.gsub(uri.query, '')}#{CGI.escape(CGI.unescape(uri.query))}"
157
+ uri.to_s
158
+ end
159
+
160
+ def file_type_from(headers)
161
+ file_type = 'application/octet-stream'
162
+ file_type = if headers.include?('Content-Type')
163
+ headers['Content-Type'].split(';').first
164
+ else
165
+ MIME::Types.of(filename_from(headers)).first.content_type
166
+ end
167
+
168
+ return file_type
169
+ end
170
+
171
+ end
172
+ end
@@ -0,0 +1,203 @@
1
+ #encoding: UTF-8
2
+ require 'nokogiri'
3
+ require 'erb'
4
+ require 'date'
5
+ require 'minitar'
6
+ require 'zlib'
7
+ require 'cgi'
8
+ require 'active_support/core_ext/hash'
9
+ require 'fileutils'
10
+
11
module DataCollector
  # Accumulates key/value data and renders it through ERB templates into
  # strings, XML files, tar archives or JSON files.
  class Output
    include Enumerable

    # The underlying Hash holding everything stored so far.
    attr_reader :data

    def initialize(data = {})
      @data = data
      @logger = Logger.new(STDOUT)
    end

    # Yields every stored [key, value] pair.
    def each
      @data.each { |pair| yield pair }
    end

    # Returns the value stored under +k+ (the second argument is ignored).
    def [](k, v = nil)
      data[k]
    end

    # Stores +v+ under +k+. nil values are ignored. Storing under a key that
    # already exists does not overwrite: the values are collected in an Array.
    def []=(k, v = nil)
      return data if v.nil?

      if !data.has_key?(k)
        data[k] = v
      elsif data[k].is_a?(Array)
        data[k] << v
      else
        data[k] = [data[k], v]
      end

      data
    end

    # Adds a Hash (entry by entry, through #[]=) or an Array (appended,
    # flattened and without nils, to the "datap" entry).
    def <<(input_data)
      if input_data.is_a?(Hash)
        input_data.each do |k, v|
          self[k] = v
        end
      elsif input_data.is_a?(Array)
        existing = @data.has_key?("datap") ? @data["datap"] : []
        # flatten before compact so that nils nested in sub-arrays go too
        @data["datap"] = (Array(existing) + input_data).flatten.compact
      end
    end

    # The underlying Hash (same object as #data).
    def raw
      @data
    end

    # Drops all stored data and triggers a full garbage collection.
    def clear
      @data = {}
      GC.start(full_mark: true, immediate_sweep: true)
    end

    # Renders the ERB template +erb_file+ (trim mode '>') and returns the
    # result. The template sees the stored hash as +data+ and can call
    # #print and #no_tag_print. The current timestamp is stored under
    # data[:response_date] first.
    #
    # Without an argument the stored hash is returned as a String, so that
    # `puts output` and string interpolation keep working.
    #
    # Raises RuntimeError when the template cannot be read or rendered.
    def to_s(erb_file = nil)
      return @data.to_s if erb_file.nil?

      data = @data # local on purpose: looked up by templates via binding
      data[:response_date] = DateTime.now.xmlschema

      ERB.new(File.read(erb_file), trim_mode: '>').result(binding)
    rescue StandardError, ScriptError => e
      # ScriptError covers syntax errors inside the template
      raise "unable to transform to text: #{e.message}"
    end

    # Template helper: renders data[symbol] as XML element(s) named after
    # +to_symbol+ (or +symbol+). Arrays yield one element per item, hashes a
    # wrapping element with one child per key. Text is HTML-escaped.
    # Returns nil when the value is missing or cannot be rendered, so that
    # nothing ends up in the template output.
    def print(data, symbol, to_symbol = nil)
      tag = (to_symbol || symbol).to_s
      value = data.with_indifferent_access[symbol]
      return nil unless value

      case value
      when Array
        value.map { |d| "<#{tag}>#{CGI.escapeHTML(d.to_s)}</#{tag}>" }.join("\n")
      when Hash
        children = value.keys.map { |k| print(value, k) }
        ["<#{tag}>", *children, "</#{tag}>"].join("\n")
      else
        "<#{tag}>#{CGI.escapeHTML(value.to_s)}</#{tag}>"
      end
    rescue StandardError
      @logger.error("unable to print data '#{symbol}'")
      nil
    end

    # Template helper: like #print but without the surrounding element;
    # array items are joined with ",\n".
    def no_tag_print(data, symbol)
      value = data.with_indifferent_access[symbol]
      return nil unless value

      if value.is_a?(Array)
        value.map { |d| CGI.escapeHTML(d.to_s) }.join(",\n")
      else
        CGI.escapeHTML(value.to_s)
      end
    rescue StandardError
      @logger.error("unable to print (without tag) data '#{symbol}'")
      nil
    end

    # Renders +erb_file+ and writes the XML to
    # "<records_dir>/<id>_<random>.xml", creating the directory when needed.
    # Returns the file name.
    def to_tmp_file(erb_file, records_dir)
      id = record_id
      xml_result = render_xml(erb_file)

      FileUtils.mkdir_p(records_dir) unless File.directory?(records_dir)

      file_name = "#{records_dir}/#{id}_#{rand(1000)}.xml"
      File.open(file_name, 'wb:UTF-8') do |f|
        f.puts xml_result.to_xml
      end

      file_name
    end

    # Renders +erb_file+ and writes the XML below "records/": either to
    # "<id>_<random>.xml" (returns that file name) or, when +tar_file_name+
    # is given, as the single entry of that gzipped tar file (returns
    # +tar_file_name+). Missing directories are created.
    #
    # Raises RuntimeError when rendering or writing fails.
    def to_file(erb_file, tar_file_name = nil)
      id = record_id
      xml_data = render_xml(erb_file).to_xml
      entry_name = "#{id}_#{rand(1000)}.xml"

      if tar_file_name.nil?
        file_name = "records/#{entry_name}"
        FileUtils.mkdir_p(File.dirname(file_name))
        File.open(file_name, 'wb:UTF-8') do |f|
          f.puts xml_data
        end

        file_name
      else
        tar_path = "records/#{tar_file_name}"
        FileUtils.mkdir_p(File.dirname(tar_path))
        Minitar::Output.open(Zlib::GzipWriter.new(File.open(tar_path, 'wb'))) do |f|
          # tar sizes are byte counts; a character count truncates
          # multi-byte UTF-8 content
          f.tar.add_file_simple(entry_name, data: xml_data, size: xml_data.bytesize, mtime: Time.now.to_i)
        end

        tar_file_name
      end
    rescue StandardError, ScriptError => e
      raise "unable to save to file: #{e.message}"
    end

    # Writes +jsondata+ as JSON to
    # "records/<jsonfile>_<timestamp>_<random>.json" (the directory is
    # created when needed) and returns the file name.
    #
    # Raises RuntimeError when writing fails.
    def to_jsonfile(jsondata, jsonfile)
      file_name = "records/#{jsonfile}_#{Time.now.to_i}_#{rand(1000)}.json"
      FileUtils.mkdir_p(File.dirname(file_name))
      File.open(file_name, 'wb') do |f|
        f.puts jsondata.to_json
      end

      file_name
    rescue StandardError => e
      raise "unable to save to jsonfile: #{e.message}"
    end

    # Returns a new plain Hash holding the stored key/value pairs
    # (a shallow copy).
    def flatten
      @data.each_with_object({}) { |(key, value), out| out[key] = value }
    end

    private

    # Identifier used in generated file names: data[:id], or its first
    # element when it is an Array; 'unknown' when absent.
    def record_id
      id = @data.is_a?(Hash) ? @data[:id] : nil
      id = id.first if id.is_a?(Array)
      id.nil? ? 'unknown' : id
    end

    # Renders +erb_file+ and parses the result as a UTF-8 XML document with
    # blank nodes removed.
    def render_xml(erb_file)
      Nokogiri::XML(to_s(erb_file), nil, 'UTF-8') do |config|
        config.noblanks
      end
    end

    # Opens (once) "records/<tar_file_name>" for appending as a tar output.
    # NOTE(review): not called anywhere in this class, and the handle is
    # never closed here.
    def tar_file(tar_file_name)
      @tar_file ||= Minitar::Output.open(File.open("records/#{tar_file_name}", "a+b"))
    end
  end
end
@@ -0,0 +1,39 @@
1
+ # encoding: utf-8
2
+ require 'logger'
3
+ require_relative 'core'
4
+
5
module DataCollector
  # Runs a set of ETL rules, given either as a block or as a Ruby rule file,
  # and logs how long the run took.
  class Runner
    # Makes input, output, filter, config and log available to rule files,
    # which are evaluated in the context of the runner itself.
    include DataCollector::Core

    # logger - receives error and timing messages (defaults to STDOUT).
    # Note: sets Encoding.default_external to UTF-8 for the whole process.
    def initialize(logger = Logger.new(STDOUT))
      Encoding.default_external = 'UTF-8'
      @logger = logger
    end

    # Executes the rules.
    #
    # * With a block: yields a fresh object that includes DataCollector::Core
    #   and shares this runner's logger, so that `log` works inside the block.
    # * With +rule_file_name+: evaluates that file in the context of the
    #   runner. The file is executed as Ruby code, so only run trusted files.
    # * With neither: logs an error.
    #
    # Returns the runner; returns nil when a DataCollector::Error was raised
    # (its message and backtrace are printed). The elapsed time is always
    # logged.
    def run(rule_file_name = nil)
      @time_start = Time.now

      if block_given?
        context = Class.new do
          include DataCollector::Core
        end.new
        # Core#log and Core#filter report through @logger
        context.instance_variable_set(:@logger, @logger)

        yield context
      elsif !rule_file_name.nil?
        # passing the file name gives errors in the rules a usable location
        instance_eval(File.read(rule_file_name), rule_file_name)
      else
        @logger.error('Please supply a block or file')
      end

      self
    rescue Error => e
      puts e.message
      puts e.backtrace.join("\n")
    ensure
      @logger.info("Finished in #{((Time.now - @time_start)*1000).to_i} ms")
    end
  end
end
@@ -0,0 +1,4 @@
1
+ # encoding: utf-8
2
module DataCollector
  # Release number of the gem.
  VERSION = '0.1.1'
end
metadata ADDED
@@ -0,0 +1,231 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: data_collector
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.1
5
+ platform: ruby
6
+ authors:
7
+ - Mehmet Celik
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2019-07-08 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.10'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.10'
27
+ - !ruby/object:Gem::Dependency
28
+ name: json
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '2.2'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '2.2'
41
+ - !ruby/object:Gem::Dependency
42
+ name: jsonpath
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '1.0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '1.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: nori
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '2.6'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '2.6'
69
+ - !ruby/object:Gem::Dependency
70
+ name: http
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '4.1'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '4.1'
83
+ - !ruby/object:Gem::Dependency
84
+ name: mime-types
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: '3.2'
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '3.2'
97
+ - !ruby/object:Gem::Dependency
98
+ name: minitar
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - '='
102
+ - !ruby/object:Gem::Version
103
+ version: '0.8'
104
+ type: :runtime
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - '='
109
+ - !ruby/object:Gem::Version
110
+ version: '0.8'
111
+ - !ruby/object:Gem::Dependency
112
+ name: activesupport
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - "~>"
116
+ - !ruby/object:Gem::Version
117
+ version: '5.2'
118
+ type: :runtime
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - "~>"
123
+ - !ruby/object:Gem::Version
124
+ version: '5.2'
125
+ - !ruby/object:Gem::Dependency
126
+ name: redis
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - "~>"
130
+ - !ruby/object:Gem::Version
131
+ version: '4.1'
132
+ type: :runtime
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - "~>"
137
+ - !ruby/object:Gem::Version
138
+ version: '4.1'
139
+ - !ruby/object:Gem::Dependency
140
+ name: bundler
141
+ requirement: !ruby/object:Gem::Requirement
142
+ requirements:
143
+ - - "~>"
144
+ - !ruby/object:Gem::Version
145
+ version: '2.0'
146
+ type: :development
147
+ prerelease: false
148
+ version_requirements: !ruby/object:Gem::Requirement
149
+ requirements:
150
+ - - "~>"
151
+ - !ruby/object:Gem::Version
152
+ version: '2.0'
153
+ - !ruby/object:Gem::Dependency
154
+ name: rake
155
+ requirement: !ruby/object:Gem::Requirement
156
+ requirements:
157
+ - - "~>"
158
+ - !ruby/object:Gem::Version
159
+ version: '10.0'
160
+ type: :development
161
+ prerelease: false
162
+ version_requirements: !ruby/object:Gem::Requirement
163
+ requirements:
164
+ - - "~>"
165
+ - !ruby/object:Gem::Version
166
+ version: '10.0'
167
+ - !ruby/object:Gem::Dependency
168
+ name: minitest
169
+ requirement: !ruby/object:Gem::Requirement
170
+ requirements:
171
+ - - "~>"
172
+ - !ruby/object:Gem::Version
173
+ version: '5.0'
174
+ type: :development
175
+ prerelease: false
176
+ version_requirements: !ruby/object:Gem::Requirement
177
+ requirements:
178
+ - - "~>"
179
+ - !ruby/object:Gem::Version
180
+ version: '5.0'
181
+ description: INPUT, FILTER, OUTPUT data
182
+ email:
183
+ - mehmet@celik.be
184
+ executables: []
185
+ extensions: []
186
+ extra_rdoc_files: []
187
+ files:
188
+ - ".gitignore"
189
+ - ".travis.yml"
190
+ - Gemfile
191
+ - LICENSE.txt
192
+ - README.md
193
+ - Rakefile
194
+ - bin/console
195
+ - bin/setup
196
+ - data_collector.gemspec
197
+ - lib/data_collector.rb
198
+ - lib/data_collector/config_file.rb
199
+ - lib/data_collector/core.rb
200
+ - lib/data_collector/ext/xml_utility_node.rb
201
+ - lib/data_collector/input.rb
202
+ - lib/data_collector/output.rb
203
+ - lib/data_collector/runner.rb
204
+ - lib/data_collector/version.rb
205
+ homepage: https://github.com/mehmetc/data_collector
206
+ licenses:
207
+ - MIT
208
+ metadata:
209
+ homepage_uri: https://github.com/mehmetc/data_collector
210
+ source_code_uri: https://github.com/mehmetc/data_collect
211
+ changelog_uri: https://github.com/mehmetc/data_collect
212
+ post_install_message:
213
+ rdoc_options: []
214
+ require_paths:
215
+ - lib
216
+ required_ruby_version: !ruby/object:Gem::Requirement
217
+ requirements:
218
+ - - ">="
219
+ - !ruby/object:Gem::Version
220
+ version: '0'
221
+ required_rubygems_version: !ruby/object:Gem::Requirement
222
+ requirements:
223
+ - - ">="
224
+ - !ruby/object:Gem::Version
225
+ version: '0'
226
+ requirements: []
227
+ rubygems_version: 3.0.2
228
+ signing_key:
229
+ specification_version: 4
230
+ summary: ETL library
231
+ test_files: []