data_collector 0.2.2 → 0.2.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +17 -4
- data/data_collector.gemspec +3 -2
- data/lib/data_collector/input.rb +95 -90
- data/lib/data_collector/version.rb +1 -1
- metadata +18 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: bc7b7aa820d36ad017732ae695c841013cce3a826e9042e19a2995442847a39b
|
4
|
+
data.tar.gz: 6e2e5012d09ea7cf08c967093242f2e5af7389e62cd3d631b213ba948b019b8a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fb68056b1e72c17f51de9df99487f85b01dc64fb17951419eebb7a5e35d07906f4b47093092ecf6afda5b1a415a51e12d5ad7c47fda9bae39c783b0187eb52fc
|
7
|
+
data.tar.gz: 3d3a3b35c1b011fb55bec52ea98a2291a2d3fa2d270ce68948879d4c4b1978af4f15a0bb524e5b5fa20a924252d349ea1907f25d1df99674885a23a9d4fd2a52
|
data/README.md
CHANGED
@@ -3,12 +3,26 @@ Convenience module to Extract, Transform and Load your data.
|
|
3
3
|
|
4
4
|
#### input
|
5
5
|
Read input from a URI
|
6
|
+
|
7
|
+
**Public methods**
|
8
|
+
```ruby
|
9
|
+
from_uri(source, options = {:raw, :content_type})
|
10
|
+
```
|
11
|
+
- source: a URI with a scheme of http, https, or file
|
12
|
+
- options:
|
13
|
+
- raw: _boolean_ do not parse
|
14
|
+
- content_type: _string_ force a content_type if the 'Content-Type' returned by the http server is incorrect
|
15
|
+
|
6
16
|
example:
|
7
17
|
```ruby
|
8
18
|
input.from_uri("http://www.libis.be")
|
9
19
|
input.from_uri("file://hello.txt")
|
20
|
+
input.from_uri("http://www.libis.be/record.jsonld", content_type: 'application/ld+json')
|
10
21
|
```
|
11
22
|
|
23
|
+
|
24
|
+
|
25
|
+
|
12
26
|
Inputs can be JSON, XML, or CSV, or XML inside a TAR.GZ file
|
13
27
|
|
14
28
|
### output
|
@@ -67,19 +81,18 @@ Into a temp directory
|
|
67
81
|
```
|
68
82
|
|
69
83
|
#### filter
|
70
|
-
filter data from a hash using [
|
84
|
+
filter data from a hash using [JSONPath](http://goessner.net/articles/JsonPath/index.html)
|
71
85
|
|
72
86
|
```ruby
|
73
87
|
filtered_data = filter(data, "$..metadata.record")
|
74
88
|
```
|
75
89
|
|
76
90
|
#### rules
|
77
|
-
|
78
|
-
rules allows you to define a simple structure to run against a JSONPath filter
|
91
|
+
Allows you to define a simple lambda structure to run against a JSONPath filter
|
79
92
|
|
80
93
|
A rule is a Hash: each key is the map key field, and its value is a Hash containing a JSONPath filter and options to apply a convert method to the filtered results.
|
81
94
|
Available convert methods are: time, map, each, call, suffix
|
82
|
-
- time:
|
95
|
+
- time: parses a given time/date string into a Time object
|
83
96
|
- map: applies a mapping to a filter
|
84
97
|
- suffix: adds a suffix to a result
|
85
98
|
- call: executes a lambda on the filter
|
data/data_collector.gemspec
CHANGED
@@ -9,8 +9,8 @@ Gem::Specification.new do |spec|
|
|
9
9
|
spec.authors = ["Mehmet Celik"]
|
10
10
|
spec.email = ["mehmet@celik.be"]
|
11
11
|
|
12
|
-
spec.summary = %q{ETL library}
|
13
|
-
spec.description = %q{INPUT, FILTER, OUTPUT data}
|
12
|
+
spec.summary = %q{ETL helper library}
|
13
|
+
spec.description = %q{INPUT, FILTER, OUTPUT data with RULES and code}
|
14
14
|
spec.homepage = "https://github.com/mehmetc/data_collector"
|
15
15
|
spec.license = "MIT"
|
16
16
|
|
@@ -44,6 +44,7 @@ Gem::Specification.new do |spec|
|
|
44
44
|
spec.add_runtime_dependency "mime-types", "~> 3.2"
|
45
45
|
spec.add_runtime_dependency "minitar", "= 0.9"
|
46
46
|
spec.add_runtime_dependency "activesupport", "~> 5.2"
|
47
|
+
spec.add_runtime_dependency "json-ld", "~> 3.1"
|
47
48
|
|
48
49
|
spec.add_development_dependency "bundler", "~> 2.0"
|
49
50
|
spec.add_development_dependency "rake", "~> 10.0"
|
data/lib/data_collector/input.rb
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
require 'http'
|
3
3
|
require 'open-uri'
|
4
4
|
require 'nokogiri'
|
5
|
-
require 'json'
|
5
|
+
require 'json/ld'
|
6
6
|
require 'nori'
|
7
7
|
require 'uri'
|
8
8
|
require 'logger'
|
@@ -15,20 +15,20 @@ require 'csv'
|
|
15
15
|
|
16
16
|
#require_relative 'ext/xml_utility_node'
|
17
17
|
module DataCollector
|
18
|
-
class Input
|
19
|
-
|
18
|
+
class Input
|
19
|
+
attr_reader :raw
|
20
20
|
|
21
|
-
|
22
|
-
|
23
|
-
|
21
|
+
def initialize
|
22
|
+
@logger = Logger.new(STDOUT)
|
23
|
+
end
|
24
24
|
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
25
|
+
def from_uri(source, options = {})
|
26
|
+
source = CGI.unescapeHTML(source)
|
27
|
+
@logger.info("Loading #{source}")
|
28
|
+
uri = URI(source)
|
29
|
+
begin
|
30
|
+
data = nil
|
31
|
+
case uri.scheme
|
32
32
|
when 'http'
|
33
33
|
data = from_http(uri, options)
|
34
34
|
when 'https'
|
@@ -37,42 +37,43 @@ class Input
|
|
37
37
|
data = from_file(uri, options)
|
38
38
|
else
|
39
39
|
raise "Do not know how to process #{source}"
|
40
|
-
|
40
|
+
end
|
41
41
|
|
42
|
-
|
42
|
+
data = data.nil? ? 'no data found' : data
|
43
43
|
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
44
|
+
if block_given?
|
45
|
+
yield data
|
46
|
+
else
|
47
|
+
data
|
48
|
+
end
|
49
|
+
rescue => e
|
50
|
+
@logger.info(e.message)
|
51
|
+
puts e.backtrace.join("\n")
|
52
|
+
nil
|
48
53
|
end
|
49
|
-
rescue => e
|
50
|
-
@logger.info(e.message)
|
51
|
-
puts e.backtrace.join("\n")
|
52
|
-
nil
|
53
54
|
end
|
54
|
-
end
|
55
55
|
|
56
|
-
|
57
|
-
def from_http(uri, options = {})
|
58
|
-
from_https(uri, options)
|
59
|
-
end
|
60
|
-
|
61
|
-
def from_https(uri, options = {})
|
62
|
-
data = nil
|
63
|
-
http = HTTP
|
56
|
+
private
|
64
57
|
|
65
|
-
|
66
|
-
|
67
|
-
password = options[:password]
|
68
|
-
http = HTTP.basic_auth(user: user, pass: password)
|
69
|
-
else
|
70
|
-
@logger.warn ("User or Password parameter not found")
|
58
|
+
def from_http(uri, options = {})
|
59
|
+
from_https(uri, options)
|
71
60
|
end
|
72
61
|
|
73
|
-
|
62
|
+
def from_https(uri, options = {})
|
63
|
+
data = nil
|
64
|
+
http = HTTP
|
65
|
+
|
66
|
+
if options.keys.include?(:user) && options.keys.include?(:password)
|
67
|
+
user = options[:user]
|
68
|
+
password = options[:password]
|
69
|
+
http = HTTP.basic_auth(user: user, pass: password)
|
70
|
+
else
|
71
|
+
@logger.warn ("User or Password parameter not found")
|
72
|
+
end
|
74
73
|
|
75
|
-
|
74
|
+
http_response = http.get(escape_uri(uri))
|
75
|
+
|
76
|
+
case http_response.code
|
76
77
|
when 200
|
77
78
|
@raw = data = http_response.body.to_s
|
78
79
|
|
@@ -80,21 +81,24 @@ class Input
|
|
80
81
|
# f.puts data
|
81
82
|
# end
|
82
83
|
|
83
|
-
file_type = file_type_from(http_response.headers)
|
84
|
+
file_type = options.with_indifferent_access.has_key?(:content_type) ? options.with_indifferent_access[:content_type] : file_type_from(http_response.headers)
|
84
85
|
|
85
86
|
unless options.with_indifferent_access.has_key?(:raw) && options.with_indifferent_access[:raw] == true
|
86
87
|
case file_type
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
88
|
+
when 'application/ld+json'
|
89
|
+
data = JSON.parse(data)
|
90
|
+
when 'application/json'
|
91
|
+
data = JSON.parse(data)
|
92
|
+
when 'application/atom+xml'
|
93
|
+
data = xml_to_hash(data)
|
94
|
+
when 'text/csv'
|
95
|
+
data = csv_to_hash(data)
|
96
|
+
when 'application/xml'
|
97
|
+
data = xml_to_hash(data)
|
98
|
+
when 'text/xml'
|
99
|
+
data = xml_to_hash(data)
|
100
|
+
else
|
101
|
+
data = xml_to_hash(data)
|
98
102
|
end
|
99
103
|
end
|
100
104
|
when 401
|
@@ -103,17 +107,19 @@ class Input
|
|
103
107
|
raise 'Not found'
|
104
108
|
else
|
105
109
|
raise "Unable to process received status code = #{http_response.code}"
|
106
|
-
|
110
|
+
end
|
107
111
|
|
108
|
-
|
109
|
-
|
112
|
+
data
|
113
|
+
end
|
110
114
|
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
115
|
+
def from_file(uri, options = {})
|
116
|
+
data = nil
|
117
|
+
absolute_path = File.absolute_path("#{uri.host}#{uri.path}")
|
118
|
+
unless options.has_key?('raw') && options['raw'] == true
|
119
|
+
@raw = data = File.read("#{absolute_path}")
|
120
|
+
case File.extname(absolute_path)
|
121
|
+
when '.jsonld'
|
122
|
+
data = JSON.parse(data)
|
117
123
|
when '.json'
|
118
124
|
data = JSON.parse(data)
|
119
125
|
when '.xml'
|
@@ -129,44 +135,43 @@ class Input
|
|
129
135
|
data = csv_to_hash(data)
|
130
136
|
else
|
131
137
|
raise "Do not know how to process #{uri.to_s}"
|
138
|
+
end
|
132
139
|
end
|
140
|
+
|
141
|
+
data
|
133
142
|
end
|
134
143
|
|
135
|
-
data
|
136
|
-
|
144
|
+
def xml_to_hash(data)
|
145
|
+
#gsub('<\/', '< /') outherwise wrong XML-parsing (see records lirias1729192 )
|
146
|
+
data = data.gsub /</, '< /'
|
147
|
+
nori = Nori.new(parser: :nokogiri, strip_namespaces: true, convert_tags_to: lambda { |tag| tag.gsub(/^@/, '_') })
|
148
|
+
nori.parse(data)
|
149
|
+
#JSON.parse(nori.parse(data).to_json)
|
150
|
+
end
|
137
151
|
|
138
|
-
|
139
|
-
|
140
|
-
#gsub('<\/', '< /') outherwise wrong XML-parsing (see records lirias1729192 )
|
141
|
-
data = data.gsub /</, '< /'
|
142
|
-
nori = Nori.new(parser: :nokogiri, strip_namespaces: true, convert_tags_to: lambda {|tag| tag.gsub(/^@/, '_')})
|
143
|
-
nori.parse(data)
|
144
|
-
#JSON.parse(nori.parse(data).to_json)
|
145
|
-
end
|
152
|
+
def csv_to_hash(data)
|
153
|
+
csv = CSV.parse(data, headers: true, header_converters: [:downcase, :symbol])
|
146
154
|
|
147
|
-
|
148
|
-
|
155
|
+
csv.collect do |record|
|
156
|
+
record.to_hash
|
157
|
+
end
|
158
|
+
end
|
149
159
|
|
150
|
-
|
151
|
-
|
160
|
+
def escape_uri(uri)
|
161
|
+
#"#{uri.to_s.gsub(uri.query, '')}#{CGI.escape(CGI.unescape(uri.query))}"
|
162
|
+
uri.to_s
|
152
163
|
end
|
153
|
-
end
|
154
164
|
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
165
|
+
def file_type_from(headers)
|
166
|
+
file_type = 'application/octet-stream'
|
167
|
+
file_type = if headers.include?('Content-Type')
|
168
|
+
headers['Content-Type'].split(';').first
|
169
|
+
else
|
170
|
+
MIME::Types.of(filename_from(headers)).first.content_type
|
171
|
+
end
|
159
172
|
|
160
|
-
|
161
|
-
|
162
|
-
file_type = if headers.include?('Content-Type')
|
163
|
-
headers['Content-Type'].split(';').first
|
164
|
-
else
|
165
|
-
MIME::Types.of(filename_from(headers)).first.content_type
|
166
|
-
end
|
173
|
+
return file_type
|
174
|
+
end
|
167
175
|
|
168
|
-
return file_type
|
169
176
|
end
|
170
|
-
|
171
|
-
end
|
172
177
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: data_collector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Mehmet Celik
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2020-01-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -122,6 +122,20 @@ dependencies:
|
|
122
122
|
- - "~>"
|
123
123
|
- !ruby/object:Gem::Version
|
124
124
|
version: '5.2'
|
125
|
+
- !ruby/object:Gem::Dependency
|
126
|
+
name: json-ld
|
127
|
+
requirement: !ruby/object:Gem::Requirement
|
128
|
+
requirements:
|
129
|
+
- - "~>"
|
130
|
+
- !ruby/object:Gem::Version
|
131
|
+
version: '3.1'
|
132
|
+
type: :runtime
|
133
|
+
prerelease: false
|
134
|
+
version_requirements: !ruby/object:Gem::Requirement
|
135
|
+
requirements:
|
136
|
+
- - "~>"
|
137
|
+
- !ruby/object:Gem::Version
|
138
|
+
version: '3.1'
|
125
139
|
- !ruby/object:Gem::Dependency
|
126
140
|
name: bundler
|
127
141
|
requirement: !ruby/object:Gem::Requirement
|
@@ -164,7 +178,7 @@ dependencies:
|
|
164
178
|
- - "~>"
|
165
179
|
- !ruby/object:Gem::Version
|
166
180
|
version: '5.0'
|
167
|
-
description: INPUT, FILTER, OUTPUT data
|
181
|
+
description: INPUT, FILTER, OUTPUT data with RULES and code
|
168
182
|
email:
|
169
183
|
- mehmet@celik.be
|
170
184
|
executables: []
|
@@ -214,5 +228,5 @@ requirements: []
|
|
214
228
|
rubygems_version: 3.0.2
|
215
229
|
signing_key:
|
216
230
|
specification_version: 4
|
217
|
-
summary: ETL library
|
231
|
+
summary: ETL helper library
|
218
232
|
test_files: []
|