data_collector 0.2.2 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 67dc86adcebab3a6deba3ca2adf6c41c5925c03fc0dca627b030a39ff889d796
4
- data.tar.gz: a4283972b9035c9e48370bcd07f3a375fb5312de584ed757f3572cc4c0b47164
3
+ metadata.gz: bc7b7aa820d36ad017732ae695c841013cce3a826e9042e19a2995442847a39b
4
+ data.tar.gz: 6e2e5012d09ea7cf08c967093242f2e5af7389e62cd3d631b213ba948b019b8a
5
5
  SHA512:
6
- metadata.gz: 65fee9ff5f15c521afc0f8181a67cebe5d7d3668a72e5cf792572745b9f7279565972cfea1568fed08d1173b33686c7fd9038c244afbc37891d25a25b6d3962d
7
- data.tar.gz: 5cb4b0144a4993a00af0fc9d9637f9bc1c1fb0b72fd082e30c9f7ffbd319fbd2ad28ce95ae9b0fe54020bcb0e3ec0251380830ff4525d7a80c33966355a1ac0a
6
+ metadata.gz: fb68056b1e72c17f51de9df99487f85b01dc64fb17951419eebb7a5e35d07906f4b47093092ecf6afda5b1a415a51e12d5ad7c47fda9bae39c783b0187eb52fc
7
+ data.tar.gz: 3d3a3b35c1b011fb55bec52ea98a2291a2d3fa2d270ce68948879d4c4b1978af4f15a0bb524e5b5fa20a924252d349ea1907f25d1df99674885a23a9d4fd2a52
data/README.md CHANGED
@@ -3,12 +3,26 @@ Convinience module to Extract, Transform and Load your data.
3
3
 
4
4
  #### input
5
5
  Read input from a URI
6
+
7
+ **Public methods**
8
+ ```ruby
9
+ from_uri(source, options = {:raw, :content_type})
10
+ ```
11
+ - source: a URI with a scheme of http, https, or file
12
+ - options:
13
+ - raw: _boolean_ do not parse
14
+ - content_type: _string_ force a content_type if the 'Content-Type' returned by the http server is incorrect
15
+
6
16
  example:
7
17
  ```ruby
8
18
  input.from_uri("http://www.libis.be")
9
19
  input.from_uri("file://hello.txt")
20
+ input.from_uri("http://www.libis.be/record.jsonld", content_type: 'application/ld+json')
10
21
  ```
11
22
 
23
+
24
+
25
+
12
26
  Inputs can be JSON, XML, CSV, or XML in a TAR.GZ file
13
27
 
14
28
  ### output
@@ -67,19 +81,18 @@ Into a temp directory
67
81
  ```
68
82
 
69
83
  #### filter
70
- filter data from a hash using [JsonPath](http://goessner.net/articles/JsonPath/index.html)
84
+ filter data from a hash using [JSONPath](http://goessner.net/articles/JsonPath/index.html)
71
85
 
72
86
  ```ruby
73
87
  filtered_data = filter(data, "$..metadata.record")
74
88
  ```
75
89
 
76
90
  #### rules
77
-
78
- rules allows you to define a simple structure to run against a JSONPath filter
91
+ Allows you to define a simple lambda structure to run against a JSONPath filter
79
92
 
80
93
  A rule is made up of a Hash: the key is the map key field, and its value is a Hash with a JSONPath filter and options to apply a convert method on the filtered results.
81
94
  Available convert methods are: time, map, each, call, suffix
82
- - time: Parses a given time/date string into a Time object
95
+ - time: parses a given time/date string into a Time object
83
96
  - map: applies a mapping to a filter
84
97
  - suffix: adds a suffix to a result
85
98
  - call: executes a lambda on the filter
@@ -9,8 +9,8 @@ Gem::Specification.new do |spec|
9
9
  spec.authors = ["Mehmet Celik"]
10
10
  spec.email = ["mehmet@celik.be"]
11
11
 
12
- spec.summary = %q{ETL library}
13
- spec.description = %q{INPUT, FILTER, OUTPUT data}
12
+ spec.summary = %q{ETL helper library}
13
+ spec.description = %q{INPUT, FILTER, OUTPUT data with RULES and code}
14
14
  spec.homepage = "https://github.com/mehmetc/data_collector"
15
15
  spec.license = "MIT"
16
16
 
@@ -44,6 +44,7 @@ Gem::Specification.new do |spec|
44
44
  spec.add_runtime_dependency "mime-types", "~> 3.2"
45
45
  spec.add_runtime_dependency "minitar", "= 0.9"
46
46
  spec.add_runtime_dependency "activesupport", "~> 5.2"
47
+ spec.add_runtime_dependency "json-ld", "~> 3.1"
47
48
 
48
49
  spec.add_development_dependency "bundler", "~> 2.0"
49
50
  spec.add_development_dependency "rake", "~> 10.0"
@@ -2,7 +2,7 @@
2
2
  require 'http'
3
3
  require 'open-uri'
4
4
  require 'nokogiri'
5
- require 'json'
5
+ require 'json/ld'
6
6
  require 'nori'
7
7
  require 'uri'
8
8
  require 'logger'
@@ -15,20 +15,20 @@ require 'csv'
15
15
 
16
16
  #require_relative 'ext/xml_utility_node'
17
17
  module DataCollector
18
- class Input
19
- attr_reader :raw
18
+ class Input
19
+ attr_reader :raw
20
20
 
21
- def initialize
22
- @logger = Logger.new(STDOUT)
23
- end
21
+ def initialize
22
+ @logger = Logger.new(STDOUT)
23
+ end
24
24
 
25
- def from_uri(source, options = {})
26
- source = CGI.unescapeHTML(source)
27
- @logger.info("Loading #{source}")
28
- uri = URI(source)
29
- begin
30
- data = nil
31
- case uri.scheme
25
+ def from_uri(source, options = {})
26
+ source = CGI.unescapeHTML(source)
27
+ @logger.info("Loading #{source}")
28
+ uri = URI(source)
29
+ begin
30
+ data = nil
31
+ case uri.scheme
32
32
  when 'http'
33
33
  data = from_http(uri, options)
34
34
  when 'https'
@@ -37,42 +37,43 @@ class Input
37
37
  data = from_file(uri, options)
38
38
  else
39
39
  raise "Do not know how to process #{source}"
40
- end
40
+ end
41
41
 
42
- data = data.nil? ? 'no data found' : data
42
+ data = data.nil? ? 'no data found' : data
43
43
 
44
- if block_given?
45
- yield data
46
- else
47
- data
44
+ if block_given?
45
+ yield data
46
+ else
47
+ data
48
+ end
49
+ rescue => e
50
+ @logger.info(e.message)
51
+ puts e.backtrace.join("\n")
52
+ nil
48
53
  end
49
- rescue => e
50
- @logger.info(e.message)
51
- puts e.backtrace.join("\n")
52
- nil
53
54
  end
54
- end
55
55
 
56
- private
57
- def from_http(uri, options = {})
58
- from_https(uri, options)
59
- end
60
-
61
- def from_https(uri, options = {})
62
- data = nil
63
- http = HTTP
56
+ private
64
57
 
65
- if options.keys.include?(:user) && options.keys.include?(:password)
66
- user = options[:user]
67
- password = options[:password]
68
- http = HTTP.basic_auth(user: user, pass: password)
69
- else
70
- @logger.warn ("User or Password parameter not found")
58
+ def from_http(uri, options = {})
59
+ from_https(uri, options)
71
60
  end
72
61
 
73
- http_response = http.get(escape_uri(uri))
62
+ def from_https(uri, options = {})
63
+ data = nil
64
+ http = HTTP
65
+
66
+ if options.keys.include?(:user) && options.keys.include?(:password)
67
+ user = options[:user]
68
+ password = options[:password]
69
+ http = HTTP.basic_auth(user: user, pass: password)
70
+ else
71
+ @logger.warn ("User or Password parameter not found")
72
+ end
74
73
 
75
- case http_response.code
74
+ http_response = http.get(escape_uri(uri))
75
+
76
+ case http_response.code
76
77
  when 200
77
78
  @raw = data = http_response.body.to_s
78
79
 
@@ -80,21 +81,24 @@ class Input
80
81
  # f.puts data
81
82
  # end
82
83
 
83
- file_type = file_type_from(http_response.headers)
84
+ file_type = options.with_indifferent_access.has_key?(:content_type) ? options.with_indifferent_access[:content_type] : file_type_from(http_response.headers)
84
85
 
85
86
  unless options.with_indifferent_access.has_key?(:raw) && options.with_indifferent_access[:raw] == true
86
87
  case file_type
87
- when 'application/json'
88
- data = JSON.parse(data)
89
- when 'application/atom+xml'
90
- data = xml_to_hash(data)
91
- when 'text/csv'
92
- data = csv_to_hash(data)
93
- when 'application/xml'
94
- when 'text/xml'
95
- data = xml_to_hash(data)
96
- else
97
- data = xml_to_hash(data)
88
+ when 'application/ld+json'
89
+ data = JSON.parse(data)
90
+ when 'application/json'
91
+ data = JSON.parse(data)
92
+ when 'application/atom+xml'
93
+ data = xml_to_hash(data)
94
+ when 'text/csv'
95
+ data = csv_to_hash(data)
96
+ when 'application/xml'
97
+ data = xml_to_hash(data)
98
+ when 'text/xml'
99
+ data = xml_to_hash(data)
100
+ else
101
+ data = xml_to_hash(data)
98
102
  end
99
103
  end
100
104
  when 401
@@ -103,17 +107,19 @@ class Input
103
107
  raise 'Not found'
104
108
  else
105
109
  raise "Unable to process received status code = #{http_response.code}"
106
- end
110
+ end
107
111
 
108
- data
109
- end
112
+ data
113
+ end
110
114
 
111
- def from_file(uri, options = {})
112
- data = nil
113
- absolute_path = File.absolute_path("#{uri.host}#{uri.path}")
114
- unless options.has_key?('raw') && options['raw'] == true
115
- @raw = data = File.read("#{absolute_path}")
116
- case File.extname(absolute_path)
115
+ def from_file(uri, options = {})
116
+ data = nil
117
+ absolute_path = File.absolute_path("#{uri.host}#{uri.path}")
118
+ unless options.has_key?('raw') && options['raw'] == true
119
+ @raw = data = File.read("#{absolute_path}")
120
+ case File.extname(absolute_path)
121
+ when '.jsonld'
122
+ data = JSON.parse(data)
117
123
  when '.json'
118
124
  data = JSON.parse(data)
119
125
  when '.xml'
@@ -129,44 +135,43 @@ class Input
129
135
  data = csv_to_hash(data)
130
136
  else
131
137
  raise "Do not know how to process #{uri.to_s}"
138
+ end
132
139
  end
140
+
141
+ data
133
142
  end
134
143
 
135
- data
136
- end
144
+ def xml_to_hash(data)
145
+ #gsub('<\/', '< /') outherwise wrong XML-parsing (see records lirias1729192 )
146
+ data = data.gsub /</, '< /'
147
+ nori = Nori.new(parser: :nokogiri, strip_namespaces: true, convert_tags_to: lambda { |tag| tag.gsub(/^@/, '_') })
148
+ nori.parse(data)
149
+ #JSON.parse(nori.parse(data).to_json)
150
+ end
137
151
 
138
- private
139
- def xml_to_hash(data)
140
- #gsub('<\/', '< /') outherwise wrong XML-parsing (see records lirias1729192 )
141
- data = data.gsub /</, '< /'
142
- nori = Nori.new(parser: :nokogiri, strip_namespaces: true, convert_tags_to: lambda {|tag| tag.gsub(/^@/, '_')})
143
- nori.parse(data)
144
- #JSON.parse(nori.parse(data).to_json)
145
- end
152
+ def csv_to_hash(data)
153
+ csv = CSV.parse(data, headers: true, header_converters: [:downcase, :symbol])
146
154
 
147
- def csv_to_hash(data)
148
- csv = CSV.parse(data, headers: true, header_converters: [:downcase, :symbol])
155
+ csv.collect do |record|
156
+ record.to_hash
157
+ end
158
+ end
149
159
 
150
- csv.collect do |record|
151
- record.to_hash
160
+ def escape_uri(uri)
161
+ #"#{uri.to_s.gsub(uri.query, '')}#{CGI.escape(CGI.unescape(uri.query))}"
162
+ uri.to_s
152
163
  end
153
- end
154
164
 
155
- def escape_uri(uri)
156
- #"#{uri.to_s.gsub(uri.query, '')}#{CGI.escape(CGI.unescape(uri.query))}"
157
- uri.to_s
158
- end
165
+ def file_type_from(headers)
166
+ file_type = 'application/octet-stream'
167
+ file_type = if headers.include?('Content-Type')
168
+ headers['Content-Type'].split(';').first
169
+ else
170
+ MIME::Types.of(filename_from(headers)).first.content_type
171
+ end
159
172
 
160
- def file_type_from(headers)
161
- file_type = 'application/octet-stream'
162
- file_type = if headers.include?('Content-Type')
163
- headers['Content-Type'].split(';').first
164
- else
165
- MIME::Types.of(filename_from(headers)).first.content_type
166
- end
173
+ return file_type
174
+ end
167
175
 
168
- return file_type
169
176
  end
170
-
171
- end
172
177
  end
@@ -1,4 +1,4 @@
1
1
  # encoding: utf-8
2
2
  module DataCollector
3
- VERSION = "0.2.2"
3
+ VERSION = "0.2.3"
4
4
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: data_collector
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.2
4
+ version: 0.2.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Mehmet Celik
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-12-02 00:00:00.000000000 Z
11
+ date: 2020-01-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -122,6 +122,20 @@ dependencies:
122
122
  - - "~>"
123
123
  - !ruby/object:Gem::Version
124
124
  version: '5.2'
125
+ - !ruby/object:Gem::Dependency
126
+ name: json-ld
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - "~>"
130
+ - !ruby/object:Gem::Version
131
+ version: '3.1'
132
+ type: :runtime
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - "~>"
137
+ - !ruby/object:Gem::Version
138
+ version: '3.1'
125
139
  - !ruby/object:Gem::Dependency
126
140
  name: bundler
127
141
  requirement: !ruby/object:Gem::Requirement
@@ -164,7 +178,7 @@ dependencies:
164
178
  - - "~>"
165
179
  - !ruby/object:Gem::Version
166
180
  version: '5.0'
167
- description: INPUT, FILTER, OUTPUT data
181
+ description: INPUT, FILTER, OUTPUT data with RULES and code
168
182
  email:
169
183
  - mehmet@celik.be
170
184
  executables: []
@@ -214,5 +228,5 @@ requirements: []
214
228
  rubygems_version: 3.0.2
215
229
  signing_key:
216
230
  specification_version: 4
217
- summary: ETL library
231
+ summary: ETL helper library
218
232
  test_files: []