data_collector 0.2.2 → 0.2.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 67dc86adcebab3a6deba3ca2adf6c41c5925c03fc0dca627b030a39ff889d796
4
- data.tar.gz: a4283972b9035c9e48370bcd07f3a375fb5312de584ed757f3572cc4c0b47164
3
+ metadata.gz: bc7b7aa820d36ad017732ae695c841013cce3a826e9042e19a2995442847a39b
4
+ data.tar.gz: 6e2e5012d09ea7cf08c967093242f2e5af7389e62cd3d631b213ba948b019b8a
5
5
  SHA512:
6
- metadata.gz: 65fee9ff5f15c521afc0f8181a67cebe5d7d3668a72e5cf792572745b9f7279565972cfea1568fed08d1173b33686c7fd9038c244afbc37891d25a25b6d3962d
7
- data.tar.gz: 5cb4b0144a4993a00af0fc9d9637f9bc1c1fb0b72fd082e30c9f7ffbd319fbd2ad28ce95ae9b0fe54020bcb0e3ec0251380830ff4525d7a80c33966355a1ac0a
6
+ metadata.gz: fb68056b1e72c17f51de9df99487f85b01dc64fb17951419eebb7a5e35d07906f4b47093092ecf6afda5b1a415a51e12d5ad7c47fda9bae39c783b0187eb52fc
7
+ data.tar.gz: 3d3a3b35c1b011fb55bec52ea98a2291a2d3fa2d270ce68948879d4c4b1978af4f15a0bb524e5b5fa20a924252d349ea1907f25d1df99674885a23a9d4fd2a52
data/README.md CHANGED
@@ -3,12 +3,26 @@ Convenience module to Extract, Transform and Load your data.
3
3
 
4
4
  #### input
5
5
  Read input from a URI
6
+
7
+ **Public methods**
8
+ ```ruby
9
+ from_uri(source, options = {:raw, :content_type})
10
+ ```
11
+ - source: a URI with a scheme of http, https or file
12
+ - options:
13
+ - raw: _boolean_ do not parse
14
+ - content_type: _string_ force a content_type if the 'Content-Type' returned by the http server is incorrect
15
+
6
16
  example:
7
17
  ```ruby
8
18
  input.from_uri("http://www.libis.be")
9
19
  input.from_uri("file://hello.txt")
20
+ input.from_uri("http://www.libis.be/record.jsonld", content_type: 'application/ld+json')
10
21
  ```
11
22
 
23
+
24
+
25
+
12
26
  Inputs can be JSON, XML, CSV, or XML in a TAR.GZ file
13
27
 
14
28
  ### output
@@ -67,19 +81,18 @@ Into a temp directory
67
81
  ```
68
82
 
69
83
  #### filter
70
- filter data from a hash using [JsonPath](http://goessner.net/articles/JsonPath/index.html)
84
+ filter data from a hash using [JSONPath](http://goessner.net/articles/JsonPath/index.html)
71
85
 
72
86
  ```ruby
73
87
  filtered_data = filter(data, "$..metadata.record")
74
88
  ```
75
89
 
76
90
  #### rules
77
-
78
- rules allows you to define a simple structure to run against a JSONPath filter
91
+ Allows you to define a simple lambda structure to run against a JSONPath filter
79
92
 
80
93
  A rule is made up of a Hash: the key is the map key field, and its value is a Hash with a JSONPath filter and options to apply a convert method on the filtered results.
81
94
  Available convert methods are: time, map, each, call, suffix
82
- - time: Parses a given time/date string into a Time object
95
+ - time: parses a given time/date string into a Time object
83
96
  - map: applies a mapping to a filter
84
97
  - suffix: adds a suffix to a result
85
98
  - call: executes a lambda on the filter
@@ -9,8 +9,8 @@ Gem::Specification.new do |spec|
9
9
  spec.authors = ["Mehmet Celik"]
10
10
  spec.email = ["mehmet@celik.be"]
11
11
 
12
- spec.summary = %q{ETL library}
13
- spec.description = %q{INPUT, FILTER, OUTPUT data}
12
+ spec.summary = %q{ETL helper library}
13
+ spec.description = %q{INPUT, FILTER, OUTPUT data with RULES and code}
14
14
  spec.homepage = "https://github.com/mehmetc/data_collector"
15
15
  spec.license = "MIT"
16
16
 
@@ -44,6 +44,7 @@ Gem::Specification.new do |spec|
44
44
  spec.add_runtime_dependency "mime-types", "~> 3.2"
45
45
  spec.add_runtime_dependency "minitar", "= 0.9"
46
46
  spec.add_runtime_dependency "activesupport", "~> 5.2"
47
+ spec.add_runtime_dependency "json-ld", "~> 3.1"
47
48
 
48
49
  spec.add_development_dependency "bundler", "~> 2.0"
49
50
  spec.add_development_dependency "rake", "~> 10.0"
@@ -2,7 +2,7 @@
2
2
  require 'http'
3
3
  require 'open-uri'
4
4
  require 'nokogiri'
5
- require 'json'
5
+ require 'json/ld'
6
6
  require 'nori'
7
7
  require 'uri'
8
8
  require 'logger'
@@ -15,20 +15,20 @@ require 'csv'
15
15
 
16
16
  #require_relative 'ext/xml_utility_node'
17
17
  module DataCollector
18
- class Input
19
- attr_reader :raw
18
+ class Input
19
+ attr_reader :raw
20
20
 
21
- def initialize
22
- @logger = Logger.new(STDOUT)
23
- end
21
+ def initialize
22
+ @logger = Logger.new(STDOUT)
23
+ end
24
24
 
25
- def from_uri(source, options = {})
26
- source = CGI.unescapeHTML(source)
27
- @logger.info("Loading #{source}")
28
- uri = URI(source)
29
- begin
30
- data = nil
31
- case uri.scheme
25
+ def from_uri(source, options = {})
26
+ source = CGI.unescapeHTML(source)
27
+ @logger.info("Loading #{source}")
28
+ uri = URI(source)
29
+ begin
30
+ data = nil
31
+ case uri.scheme
32
32
  when 'http'
33
33
  data = from_http(uri, options)
34
34
  when 'https'
@@ -37,42 +37,43 @@ class Input
37
37
  data = from_file(uri, options)
38
38
  else
39
39
  raise "Do not know how to process #{source}"
40
- end
40
+ end
41
41
 
42
- data = data.nil? ? 'no data found' : data
42
+ data = data.nil? ? 'no data found' : data
43
43
 
44
- if block_given?
45
- yield data
46
- else
47
- data
44
+ if block_given?
45
+ yield data
46
+ else
47
+ data
48
+ end
49
+ rescue => e
50
+ @logger.info(e.message)
51
+ puts e.backtrace.join("\n")
52
+ nil
48
53
  end
49
- rescue => e
50
- @logger.info(e.message)
51
- puts e.backtrace.join("\n")
52
- nil
53
54
  end
54
- end
55
55
 
56
- private
57
- def from_http(uri, options = {})
58
- from_https(uri, options)
59
- end
60
-
61
- def from_https(uri, options = {})
62
- data = nil
63
- http = HTTP
56
+ private
64
57
 
65
- if options.keys.include?(:user) && options.keys.include?(:password)
66
- user = options[:user]
67
- password = options[:password]
68
- http = HTTP.basic_auth(user: user, pass: password)
69
- else
70
- @logger.warn ("User or Password parameter not found")
58
+ def from_http(uri, options = {})
59
+ from_https(uri, options)
71
60
  end
72
61
 
73
- http_response = http.get(escape_uri(uri))
62
+ def from_https(uri, options = {})
63
+ data = nil
64
+ http = HTTP
65
+
66
+ if options.keys.include?(:user) && options.keys.include?(:password)
67
+ user = options[:user]
68
+ password = options[:password]
69
+ http = HTTP.basic_auth(user: user, pass: password)
70
+ else
71
+ @logger.warn ("User or Password parameter not found")
72
+ end
74
73
 
75
- case http_response.code
74
+ http_response = http.get(escape_uri(uri))
75
+
76
+ case http_response.code
76
77
  when 200
77
78
  @raw = data = http_response.body.to_s
78
79
 
@@ -80,21 +81,24 @@ class Input
80
81
  # f.puts data
81
82
  # end
82
83
 
83
- file_type = file_type_from(http_response.headers)
84
+ file_type = options.with_indifferent_access.has_key?(:content_type) ? options.with_indifferent_access[:content_type] : file_type_from(http_response.headers)
84
85
 
85
86
  unless options.with_indifferent_access.has_key?(:raw) && options.with_indifferent_access[:raw] == true
86
87
  case file_type
87
- when 'application/json'
88
- data = JSON.parse(data)
89
- when 'application/atom+xml'
90
- data = xml_to_hash(data)
91
- when 'text/csv'
92
- data = csv_to_hash(data)
93
- when 'application/xml'
94
- when 'text/xml'
95
- data = xml_to_hash(data)
96
- else
97
- data = xml_to_hash(data)
88
+ when 'application/ld+json'
89
+ data = JSON.parse(data)
90
+ when 'application/json'
91
+ data = JSON.parse(data)
92
+ when 'application/atom+xml'
93
+ data = xml_to_hash(data)
94
+ when 'text/csv'
95
+ data = csv_to_hash(data)
96
+ when 'application/xml'
97
+ data = xml_to_hash(data)
98
+ when 'text/xml'
99
+ data = xml_to_hash(data)
100
+ else
101
+ data = xml_to_hash(data)
98
102
  end
99
103
  end
100
104
  when 401
@@ -103,17 +107,19 @@ class Input
103
107
  raise 'Not found'
104
108
  else
105
109
  raise "Unable to process received status code = #{http_response.code}"
106
- end
110
+ end
107
111
 
108
- data
109
- end
112
+ data
113
+ end
110
114
 
111
- def from_file(uri, options = {})
112
- data = nil
113
- absolute_path = File.absolute_path("#{uri.host}#{uri.path}")
114
- unless options.has_key?('raw') && options['raw'] == true
115
- @raw = data = File.read("#{absolute_path}")
116
- case File.extname(absolute_path)
115
+ def from_file(uri, options = {})
116
+ data = nil
117
+ absolute_path = File.absolute_path("#{uri.host}#{uri.path}")
118
+ unless options.has_key?('raw') && options['raw'] == true
119
+ @raw = data = File.read("#{absolute_path}")
120
+ case File.extname(absolute_path)
121
+ when '.jsonld'
122
+ data = JSON.parse(data)
117
123
  when '.json'
118
124
  data = JSON.parse(data)
119
125
  when '.xml'
@@ -129,44 +135,43 @@ class Input
129
135
  data = csv_to_hash(data)
130
136
  else
131
137
  raise "Do not know how to process #{uri.to_s}"
138
+ end
132
139
  end
140
+
141
+ data
133
142
  end
134
143
 
135
- data
136
- end
144
+ def xml_to_hash(data)
145
+ #gsub('<\/', '< /') outherwise wrong XML-parsing (see records lirias1729192 )
146
+ data = data.gsub /</, '< /'
147
+ nori = Nori.new(parser: :nokogiri, strip_namespaces: true, convert_tags_to: lambda { |tag| tag.gsub(/^@/, '_') })
148
+ nori.parse(data)
149
+ #JSON.parse(nori.parse(data).to_json)
150
+ end
137
151
 
138
- private
139
- def xml_to_hash(data)
140
- #gsub('<\/', '< /') outherwise wrong XML-parsing (see records lirias1729192 )
141
- data = data.gsub /</, '< /'
142
- nori = Nori.new(parser: :nokogiri, strip_namespaces: true, convert_tags_to: lambda {|tag| tag.gsub(/^@/, '_')})
143
- nori.parse(data)
144
- #JSON.parse(nori.parse(data).to_json)
145
- end
152
+ def csv_to_hash(data)
153
+ csv = CSV.parse(data, headers: true, header_converters: [:downcase, :symbol])
146
154
 
147
- def csv_to_hash(data)
148
- csv = CSV.parse(data, headers: true, header_converters: [:downcase, :symbol])
155
+ csv.collect do |record|
156
+ record.to_hash
157
+ end
158
+ end
149
159
 
150
- csv.collect do |record|
151
- record.to_hash
160
+ def escape_uri(uri)
161
+ #"#{uri.to_s.gsub(uri.query, '')}#{CGI.escape(CGI.unescape(uri.query))}"
162
+ uri.to_s
152
163
  end
153
- end
154
164
 
155
- def escape_uri(uri)
156
- #"#{uri.to_s.gsub(uri.query, '')}#{CGI.escape(CGI.unescape(uri.query))}"
157
- uri.to_s
158
- end
165
+ def file_type_from(headers)
166
+ file_type = 'application/octet-stream'
167
+ file_type = if headers.include?('Content-Type')
168
+ headers['Content-Type'].split(';').first
169
+ else
170
+ MIME::Types.of(filename_from(headers)).first.content_type
171
+ end
159
172
 
160
- def file_type_from(headers)
161
- file_type = 'application/octet-stream'
162
- file_type = if headers.include?('Content-Type')
163
- headers['Content-Type'].split(';').first
164
- else
165
- MIME::Types.of(filename_from(headers)).first.content_type
166
- end
173
+ return file_type
174
+ end
167
175
 
168
- return file_type
169
176
  end
170
-
171
- end
172
177
  end
@@ -1,4 +1,4 @@
1
1
  # encoding: utf-8
2
2
  module DataCollector
3
- VERSION = "0.2.2"
3
+ VERSION = "0.2.3"
4
4
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: data_collector
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.2
4
+ version: 0.2.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Mehmet Celik
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-12-02 00:00:00.000000000 Z
11
+ date: 2020-01-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -122,6 +122,20 @@ dependencies:
122
122
  - - "~>"
123
123
  - !ruby/object:Gem::Version
124
124
  version: '5.2'
125
+ - !ruby/object:Gem::Dependency
126
+ name: json-ld
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - "~>"
130
+ - !ruby/object:Gem::Version
131
+ version: '3.1'
132
+ type: :runtime
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - "~>"
137
+ - !ruby/object:Gem::Version
138
+ version: '3.1'
125
139
  - !ruby/object:Gem::Dependency
126
140
  name: bundler
127
141
  requirement: !ruby/object:Gem::Requirement
@@ -164,7 +178,7 @@ dependencies:
164
178
  - - "~>"
165
179
  - !ruby/object:Gem::Version
166
180
  version: '5.0'
167
- description: INPUT, FILTER, OUTPUT data
181
+ description: INPUT, FILTER, OUTPUT data with RULES and code
168
182
  email:
169
183
  - mehmet@celik.be
170
184
  executables: []
@@ -214,5 +228,5 @@ requirements: []
214
228
  rubygems_version: 3.0.2
215
229
  signing_key:
216
230
  specification_version: 4
217
- summary: ETL library
231
+ summary: ETL helper library
218
232
  test_files: []