data_collector 0.2.2 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +17 -4
- data/data_collector.gemspec +3 -2
- data/lib/data_collector/input.rb +95 -90
- data/lib/data_collector/version.rb +1 -1
- metadata +18 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: bc7b7aa820d36ad017732ae695c841013cce3a826e9042e19a2995442847a39b
|
4
|
+
data.tar.gz: 6e2e5012d09ea7cf08c967093242f2e5af7389e62cd3d631b213ba948b019b8a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fb68056b1e72c17f51de9df99487f85b01dc64fb17951419eebb7a5e35d07906f4b47093092ecf6afda5b1a415a51e12d5ad7c47fda9bae39c783b0187eb52fc
|
7
|
+
data.tar.gz: 3d3a3b35c1b011fb55bec52ea98a2291a2d3fa2d270ce68948879d4c4b1978af4f15a0bb524e5b5fa20a924252d349ea1907f25d1df99674885a23a9d4fd2a52
|
data/README.md
CHANGED
@@ -3,12 +3,26 @@ Convenience module to Extract, Transform and Load your data.
|
|
3
3
|
|
4
4
|
#### input
|
5
5
|
Read input from a URI
|
6
|
+
|
7
|
+
**Public methods**
|
8
|
+
```ruby
|
9
|
+
from_uri(source, options = {:raw, :content_type})
|
10
|
+
```
|
11
|
+
- source: a URI with a scheme of http, https, or file
|
12
|
+
- options:
|
13
|
+
- raw: _boolean_ do not parse
|
14
|
+
- content_type: _string_ force a content_type if the 'Content-Type' returned by the http server is incorrect
|
15
|
+
|
6
16
|
example:
|
7
17
|
```ruby
|
8
18
|
input.from_uri("http://www.libis.be")
|
9
19
|
input.from_uri("file://hello.txt")
|
20
|
+
input.from_uri("http://www.libis.be/record.jsonld", content_type: 'application/ld+json')
|
10
21
|
```
|
11
22
|
|
23
|
+
|
24
|
+
|
25
|
+
|
12
26
|
Inputs can be JSON, XML, CSV, or XML inside a TAR.GZ file
|
13
27
|
|
14
28
|
### output
|
@@ -67,19 +81,18 @@ Into a temp directory
|
|
67
81
|
```
|
68
82
|
|
69
83
|
#### filter
|
70
|
-
filter data from a hash using [
|
84
|
+
filter data from a hash using [JSONPath](http://goessner.net/articles/JsonPath/index.html)
|
71
85
|
|
72
86
|
```ruby
|
73
87
|
filtered_data = filter(data, "$..metadata.record")
|
74
88
|
```
|
75
89
|
|
76
90
|
#### rules
|
77
|
-
|
78
|
-
rules allows you to define a simple structure to run against a JSONPath filter
|
91
|
+
Allows you to define a simple lambda structure to run against a JSONPath filter
|
79
92
|
|
80
93
|
A rule is a Hash: the key is the map key field, and its value is a Hash with a JSONPath filter and options to apply a convert method on the filtered results.
|
81
94
|
Available convert methods are: time, map, each, call, suffix
|
82
|
-
- time:
|
95
|
+
- time: parses a given time/date string into a Time object
|
83
96
|
- map: applies a mapping to a filter
|
84
97
|
- suffix: adds a suffix to a result
|
85
98
|
- call: executes a lambda on the filter
|
data/data_collector.gemspec
CHANGED
@@ -9,8 +9,8 @@ Gem::Specification.new do |spec|
|
|
9
9
|
spec.authors = ["Mehmet Celik"]
|
10
10
|
spec.email = ["mehmet@celik.be"]
|
11
11
|
|
12
|
-
spec.summary = %q{ETL library}
|
13
|
-
spec.description = %q{INPUT, FILTER, OUTPUT data}
|
12
|
+
spec.summary = %q{ETL helper library}
|
13
|
+
spec.description = %q{INPUT, FILTER, OUTPUT data with RULES and code}
|
14
14
|
spec.homepage = "https://github.com/mehmetc/data_collector"
|
15
15
|
spec.license = "MIT"
|
16
16
|
|
@@ -44,6 +44,7 @@ Gem::Specification.new do |spec|
|
|
44
44
|
spec.add_runtime_dependency "mime-types", "~> 3.2"
|
45
45
|
spec.add_runtime_dependency "minitar", "= 0.9"
|
46
46
|
spec.add_runtime_dependency "activesupport", "~> 5.2"
|
47
|
+
spec.add_runtime_dependency "json-ld", "~> 3.1"
|
47
48
|
|
48
49
|
spec.add_development_dependency "bundler", "~> 2.0"
|
49
50
|
spec.add_development_dependency "rake", "~> 10.0"
|
data/lib/data_collector/input.rb
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
require 'http'
|
3
3
|
require 'open-uri'
|
4
4
|
require 'nokogiri'
|
5
|
-
require 'json'
|
5
|
+
require 'json/ld'
|
6
6
|
require 'nori'
|
7
7
|
require 'uri'
|
8
8
|
require 'logger'
|
@@ -15,20 +15,20 @@ require 'csv'
|
|
15
15
|
|
16
16
|
#require_relative 'ext/xml_utility_node'
|
17
17
|
module DataCollector
|
18
|
-
class Input
|
19
|
-
|
18
|
+
class Input
|
19
|
+
attr_reader :raw
|
20
20
|
|
21
|
-
|
22
|
-
|
23
|
-
|
21
|
+
def initialize
|
22
|
+
@logger = Logger.new(STDOUT)
|
23
|
+
end
|
24
24
|
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
25
|
+
def from_uri(source, options = {})
|
26
|
+
source = CGI.unescapeHTML(source)
|
27
|
+
@logger.info("Loading #{source}")
|
28
|
+
uri = URI(source)
|
29
|
+
begin
|
30
|
+
data = nil
|
31
|
+
case uri.scheme
|
32
32
|
when 'http'
|
33
33
|
data = from_http(uri, options)
|
34
34
|
when 'https'
|
@@ -37,42 +37,43 @@ class Input
|
|
37
37
|
data = from_file(uri, options)
|
38
38
|
else
|
39
39
|
raise "Do not know how to process #{source}"
|
40
|
-
|
40
|
+
end
|
41
41
|
|
42
|
-
|
42
|
+
data = data.nil? ? 'no data found' : data
|
43
43
|
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
44
|
+
if block_given?
|
45
|
+
yield data
|
46
|
+
else
|
47
|
+
data
|
48
|
+
end
|
49
|
+
rescue => e
|
50
|
+
@logger.info(e.message)
|
51
|
+
puts e.backtrace.join("\n")
|
52
|
+
nil
|
48
53
|
end
|
49
|
-
rescue => e
|
50
|
-
@logger.info(e.message)
|
51
|
-
puts e.backtrace.join("\n")
|
52
|
-
nil
|
53
54
|
end
|
54
|
-
end
|
55
55
|
|
56
|
-
|
57
|
-
def from_http(uri, options = {})
|
58
|
-
from_https(uri, options)
|
59
|
-
end
|
60
|
-
|
61
|
-
def from_https(uri, options = {})
|
62
|
-
data = nil
|
63
|
-
http = HTTP
|
56
|
+
private
|
64
57
|
|
65
|
-
|
66
|
-
|
67
|
-
password = options[:password]
|
68
|
-
http = HTTP.basic_auth(user: user, pass: password)
|
69
|
-
else
|
70
|
-
@logger.warn ("User or Password parameter not found")
|
58
|
+
def from_http(uri, options = {})
|
59
|
+
from_https(uri, options)
|
71
60
|
end
|
72
61
|
|
73
|
-
|
62
|
+
def from_https(uri, options = {})
|
63
|
+
data = nil
|
64
|
+
http = HTTP
|
65
|
+
|
66
|
+
if options.keys.include?(:user) && options.keys.include?(:password)
|
67
|
+
user = options[:user]
|
68
|
+
password = options[:password]
|
69
|
+
http = HTTP.basic_auth(user: user, pass: password)
|
70
|
+
else
|
71
|
+
@logger.warn ("User or Password parameter not found")
|
72
|
+
end
|
74
73
|
|
75
|
-
|
74
|
+
http_response = http.get(escape_uri(uri))
|
75
|
+
|
76
|
+
case http_response.code
|
76
77
|
when 200
|
77
78
|
@raw = data = http_response.body.to_s
|
78
79
|
|
@@ -80,21 +81,24 @@ class Input
|
|
80
81
|
# f.puts data
|
81
82
|
# end
|
82
83
|
|
83
|
-
file_type = file_type_from(http_response.headers)
|
84
|
+
file_type = options.with_indifferent_access.has_key?(:content_type) ? options.with_indifferent_access[:content_type] : file_type_from(http_response.headers)
|
84
85
|
|
85
86
|
unless options.with_indifferent_access.has_key?(:raw) && options.with_indifferent_access[:raw] == true
|
86
87
|
case file_type
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
88
|
+
when 'application/ld+json'
|
89
|
+
data = JSON.parse(data)
|
90
|
+
when 'application/json'
|
91
|
+
data = JSON.parse(data)
|
92
|
+
when 'application/atom+xml'
|
93
|
+
data = xml_to_hash(data)
|
94
|
+
when 'text/csv'
|
95
|
+
data = csv_to_hash(data)
|
96
|
+
when 'application/xml'
|
97
|
+
data = xml_to_hash(data)
|
98
|
+
when 'text/xml'
|
99
|
+
data = xml_to_hash(data)
|
100
|
+
else
|
101
|
+
data = xml_to_hash(data)
|
98
102
|
end
|
99
103
|
end
|
100
104
|
when 401
|
@@ -103,17 +107,19 @@ class Input
|
|
103
107
|
raise 'Not found'
|
104
108
|
else
|
105
109
|
raise "Unable to process received status code = #{http_response.code}"
|
106
|
-
|
110
|
+
end
|
107
111
|
|
108
|
-
|
109
|
-
|
112
|
+
data
|
113
|
+
end
|
110
114
|
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
115
|
+
def from_file(uri, options = {})
|
116
|
+
data = nil
|
117
|
+
absolute_path = File.absolute_path("#{uri.host}#{uri.path}")
|
118
|
+
unless options.has_key?('raw') && options['raw'] == true
|
119
|
+
@raw = data = File.read("#{absolute_path}")
|
120
|
+
case File.extname(absolute_path)
|
121
|
+
when '.jsonld'
|
122
|
+
data = JSON.parse(data)
|
117
123
|
when '.json'
|
118
124
|
data = JSON.parse(data)
|
119
125
|
when '.xml'
|
@@ -129,44 +135,43 @@ class Input
|
|
129
135
|
data = csv_to_hash(data)
|
130
136
|
else
|
131
137
|
raise "Do not know how to process #{uri.to_s}"
|
138
|
+
end
|
132
139
|
end
|
140
|
+
|
141
|
+
data
|
133
142
|
end
|
134
143
|
|
135
|
-
data
|
136
|
-
|
144
|
+
def xml_to_hash(data)
|
145
|
+
#gsub('<\/', '< /') outherwise wrong XML-parsing (see records lirias1729192 )
|
146
|
+
data = data.gsub /</, '< /'
|
147
|
+
nori = Nori.new(parser: :nokogiri, strip_namespaces: true, convert_tags_to: lambda { |tag| tag.gsub(/^@/, '_') })
|
148
|
+
nori.parse(data)
|
149
|
+
#JSON.parse(nori.parse(data).to_json)
|
150
|
+
end
|
137
151
|
|
138
|
-
|
139
|
-
|
140
|
-
#gsub('<\/', '< /') outherwise wrong XML-parsing (see records lirias1729192 )
|
141
|
-
data = data.gsub /</, '< /'
|
142
|
-
nori = Nori.new(parser: :nokogiri, strip_namespaces: true, convert_tags_to: lambda {|tag| tag.gsub(/^@/, '_')})
|
143
|
-
nori.parse(data)
|
144
|
-
#JSON.parse(nori.parse(data).to_json)
|
145
|
-
end
|
152
|
+
def csv_to_hash(data)
|
153
|
+
csv = CSV.parse(data, headers: true, header_converters: [:downcase, :symbol])
|
146
154
|
|
147
|
-
|
148
|
-
|
155
|
+
csv.collect do |record|
|
156
|
+
record.to_hash
|
157
|
+
end
|
158
|
+
end
|
149
159
|
|
150
|
-
|
151
|
-
|
160
|
+
def escape_uri(uri)
|
161
|
+
#"#{uri.to_s.gsub(uri.query, '')}#{CGI.escape(CGI.unescape(uri.query))}"
|
162
|
+
uri.to_s
|
152
163
|
end
|
153
|
-
end
|
154
164
|
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
165
|
+
def file_type_from(headers)
|
166
|
+
file_type = 'application/octet-stream'
|
167
|
+
file_type = if headers.include?('Content-Type')
|
168
|
+
headers['Content-Type'].split(';').first
|
169
|
+
else
|
170
|
+
MIME::Types.of(filename_from(headers)).first.content_type
|
171
|
+
end
|
159
172
|
|
160
|
-
|
161
|
-
|
162
|
-
file_type = if headers.include?('Content-Type')
|
163
|
-
headers['Content-Type'].split(';').first
|
164
|
-
else
|
165
|
-
MIME::Types.of(filename_from(headers)).first.content_type
|
166
|
-
end
|
173
|
+
return file_type
|
174
|
+
end
|
167
175
|
|
168
|
-
return file_type
|
169
176
|
end
|
170
|
-
|
171
|
-
end
|
172
177
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: data_collector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.2
|
4
|
+
version: 0.2.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Mehmet Celik
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2020-01-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -122,6 +122,20 @@ dependencies:
|
|
122
122
|
- - "~>"
|
123
123
|
- !ruby/object:Gem::Version
|
124
124
|
version: '5.2'
|
125
|
+
- !ruby/object:Gem::Dependency
|
126
|
+
name: json-ld
|
127
|
+
requirement: !ruby/object:Gem::Requirement
|
128
|
+
requirements:
|
129
|
+
- - "~>"
|
130
|
+
- !ruby/object:Gem::Version
|
131
|
+
version: '3.1'
|
132
|
+
type: :runtime
|
133
|
+
prerelease: false
|
134
|
+
version_requirements: !ruby/object:Gem::Requirement
|
135
|
+
requirements:
|
136
|
+
- - "~>"
|
137
|
+
- !ruby/object:Gem::Version
|
138
|
+
version: '3.1'
|
125
139
|
- !ruby/object:Gem::Dependency
|
126
140
|
name: bundler
|
127
141
|
requirement: !ruby/object:Gem::Requirement
|
@@ -164,7 +178,7 @@ dependencies:
|
|
164
178
|
- - "~>"
|
165
179
|
- !ruby/object:Gem::Version
|
166
180
|
version: '5.0'
|
167
|
-
description: INPUT, FILTER, OUTPUT data
|
181
|
+
description: INPUT, FILTER, OUTPUT data with RULES and code
|
168
182
|
email:
|
169
183
|
- mehmet@celik.be
|
170
184
|
executables: []
|
@@ -214,5 +228,5 @@ requirements: []
|
|
214
228
|
rubygems_version: 3.0.2
|
215
229
|
signing_key:
|
216
230
|
specification_version: 4
|
217
|
-
summary: ETL library
|
231
|
+
summary: ETL helper library
|
218
232
|
test_files: []
|