ingestor 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +0 -2
- data/README.md +36 -2
- data/examples/{xml_parsing.rb → books_xml.rb} +0 -0
- data/examples/colors_json.rb +57 -0
- data/examples/{text_parsing.rb → hotel_chains_plain_text.rb} +0 -0
- data/examples/people_json.rb +55 -0
- data/ingestor.gemspec +2 -0
- data/lib/ingestor/parser/json.rb +25 -0
- data/lib/ingestor/parser/xml.rb +7 -5
- data/lib/ingestor/version.rb +1 -1
- data/samples/people.json +6 -6
- data/spec/lib/ingestor/parser/json_spec.rb +24 -4
- data/spec/lib/ingestor/parser/xml_spec.rb +0 -2
- metadata +35 -11
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -99,6 +99,33 @@ Add the following to your Rakefile
|
|
99
99
|
}
|
100
100
|
end
|
101
101
|
|
102
|
+
JSON Example
|
103
|
+
|
104
|
+
ingest("http://example.com/people.json") do
|
105
|
+
parser :json
|
106
|
+
parser_options collection: lambda{|document|
|
107
|
+
document['people']
|
108
|
+
}
|
109
|
+
map_attributes do |values|
|
110
|
+
{
|
111
|
+
name: values["first_name"] + " " + values["last_name"],
|
112
|
+
age: values['age'],
|
113
|
+
address: values['address']
|
114
|
+
}
|
115
|
+
end
|
116
|
+
|
117
|
+
# attrs holds the mapped attributes for the current entry
|
118
|
+
finder{|attrs|
|
119
|
+
Person.where(name: attrs[:name]).first || Person.new
|
120
|
+
}
|
121
|
+
|
122
|
+
processor{|attrs,record|
|
123
|
+
record.update_attributes(attrs)
|
124
|
+
record.send_junk_mail!
|
125
|
+
}
|
126
|
+
end
|
127
|
+
|
128
|
+
|
102
129
|
## Advanced Usage
|
103
130
|
DSL Options
|
104
131
|
* parser - the parser to use on the file
|
@@ -178,7 +205,13 @@ Writing parsers is simple ([see examples](https://github.com/coryodaniel/ingesto
|
|
178
205
|
* Default libxml2 best guess
|
179
206
|
|
180
207
|
### JSON Parser
|
181
|
-
|
208
|
+
Parses a JSON document
|
209
|
+
|
210
|
+
Options
|
211
|
+
* collection - receives the document and narrows it down to the collection you are interested in
|
212
|
+
* Proc(Hash)
|
213
|
+
* Returns Hash | Array
|
214
|
+
* Required
|
182
215
|
|
183
216
|
### CSV Parser
|
184
217
|
Coming soon...
|
@@ -203,9 +236,10 @@ Coming soon...
|
|
203
236
|
|
204
237
|
|
205
238
|
## Todos
|
206
|
-
* rdoc
|
239
|
+
* rdoc http://rdoc.rubyforge.org/RDoc/Markup.html
|
207
240
|
* Move includes_header to CSV, PlainText
|
208
241
|
* Mongoid Support
|
209
242
|
* sort/limit options
|
243
|
+
* configure travis
|
210
244
|
* A way to sample a file without building an ingestor first
|
211
245
|
* bin/ingestor --sample --path=./my.xml --parser xml --parser_options_xpath '//book'
|
File without changes
|
@@ -0,0 +1,57 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
require 'rubygems'
|
3
|
+
require 'bundler/setup'
|
4
|
+
require 'ingestor'
|
5
|
+
require 'ingestor/parser/json'
|
6
|
+
|
7
|
+
# Set up a bogus active model
|
8
|
+
require 'active_model'
|
9
|
+
class Color
|
10
|
+
include ActiveModel::Naming
|
11
|
+
def persisted?
|
12
|
+
true
|
13
|
+
end
|
14
|
+
# Make a fake active model
|
15
|
+
attr_accessor :name, :hex
|
16
|
+
def update_attributes(attributes = {})
|
17
|
+
attributes.each do |name, value|
|
18
|
+
send("#{name}=", value)
|
19
|
+
end
|
20
|
+
true
|
21
|
+
end
|
22
|
+
end
|
23
|
+
# end bogusness
|
24
|
+
|
25
|
+
ingest "./samples/colors.json" do
|
26
|
+
parser :json
|
27
|
+
|
28
|
+
# Receives the full document and narrows it down to the collection to process.
|
29
|
+
parser_options collection: lambda{|document|
|
30
|
+
document
|
31
|
+
}
|
32
|
+
|
33
|
+
# How to map each JSON entry's values to the record's attributes
|
34
|
+
map_attributes do |values|
|
35
|
+
{
|
36
|
+
name: values['color'],
|
37
|
+
hex: values['value']
|
38
|
+
}
|
39
|
+
end
|
40
|
+
|
41
|
+
# before{|attrs| values}
|
42
|
+
|
43
|
+
# Your strategy for finding or instantiating a new object to be handled by the processor block
|
44
|
+
finder{|attrs|
|
45
|
+
# Book.find( attrs['id'] ) || Book.new
|
46
|
+
Color.new
|
47
|
+
}
|
48
|
+
|
49
|
+
processor{|attrs,record|
|
50
|
+
# ... custom processor here ...
|
51
|
+
record.update_attributes attrs
|
52
|
+
}
|
53
|
+
|
54
|
+
after{|record|
|
55
|
+
puts "Created: #{record.name}"
|
56
|
+
}
|
57
|
+
end
|
File without changes
|
@@ -0,0 +1,55 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
require 'rubygems'
|
3
|
+
require 'bundler/setup'
|
4
|
+
require 'ingestor'
|
5
|
+
require 'ingestor/parser/json'
|
6
|
+
|
7
|
+
# Set up a bogus active model
|
8
|
+
require 'active_model'
|
9
|
+
class Person
|
10
|
+
include ActiveModel::Naming
|
11
|
+
def persisted?
|
12
|
+
true
|
13
|
+
end
|
14
|
+
# Make a fake active model
|
15
|
+
attr_accessor :id, :first_name, :last_name, :age, :address
|
16
|
+
def update_attributes(attributes = {})
|
17
|
+
attributes.each do |name, value|
|
18
|
+
send("#{name}=", value)
|
19
|
+
end
|
20
|
+
true
|
21
|
+
end
|
22
|
+
end
|
23
|
+
# end bogusness
|
24
|
+
|
25
|
+
ingest "./samples/people.json" do
|
26
|
+
parser :json
|
27
|
+
|
28
|
+
# Receives the full document and narrows it down to the collection to process.
|
29
|
+
parser_options collection: lambda{|document|
|
30
|
+
document['people']
|
31
|
+
}
|
32
|
+
#sample true
|
33
|
+
|
34
|
+
# How to map each JSON entry's values to the record's attributes
|
35
|
+
map_attributes do |values|
|
36
|
+
values
|
37
|
+
end
|
38
|
+
|
39
|
+
# before{|attrs| values}
|
40
|
+
|
41
|
+
# Your strategy for finding or instantiating a new object to be handled by the processor block
|
42
|
+
finder{|attrs|
|
43
|
+
# Book.find( attrs['id'] ) || Book.new
|
44
|
+
Person.new
|
45
|
+
}
|
46
|
+
|
47
|
+
processor{|attrs,record|
|
48
|
+
# ... custom processor here ...
|
49
|
+
record.update_attributes attrs
|
50
|
+
}
|
51
|
+
|
52
|
+
after{|record|
|
53
|
+
puts "Created: #{record.first_name} @ #{record.address}"
|
54
|
+
}
|
55
|
+
end
|
data/ingestor.gemspec
CHANGED
@@ -19,5 +19,7 @@ Gem::Specification.new do |gem|
|
|
19
19
|
gem.add_dependency "docile"
|
20
20
|
gem.add_dependency "rubyzip"
|
21
21
|
gem.add_dependency "thor"
|
22
|
+
gem.add_dependency "nokogiri", '~> 1.5.6'
|
22
23
|
gem.add_dependency "activesupport", '>= 3.2.0'
|
24
|
+
gem.add_dependency 'multi_json', '~> 1.0'
|
23
25
|
end
|
data/lib/ingestor/parser/json.rb
CHANGED
@@ -1,6 +1,31 @@
|
|
1
|
+
require 'multi_json'
|
2
|
+
require 'debugger'
|
1
3
|
module Ingestor
|
2
4
|
module Parser
|
3
5
|
class Json
|
6
|
+
include Ingestor::Parser::Base
|
7
|
+
def options(opts={})
|
8
|
+
@options = {
|
9
|
+
collection: nil
|
10
|
+
}.merge(opts)
|
11
|
+
end
|
12
|
+
|
13
|
+
def sample!
|
14
|
+
puts @options[:collection].call(document).first
|
15
|
+
#puts @options[:collection] ? @options[:collection].call(document).first : document.first
|
16
|
+
end
|
17
|
+
|
18
|
+
def process!
|
19
|
+
@options[:collection].call(document).each do |attrs|
|
20
|
+
@proxy.process_entry @proxy.options[:map_attributes].call( attrs )
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
protected
|
25
|
+
|
26
|
+
def document
|
27
|
+
MultiJson.load(@document.read)
|
28
|
+
end
|
4
29
|
end
|
5
30
|
end
|
6
31
|
end
|
data/lib/ingestor/parser/xml.rb
CHANGED
@@ -17,19 +17,21 @@ module Ingestor
|
|
17
17
|
end
|
18
18
|
|
19
19
|
def sample!
|
20
|
-
|
21
|
-
puts Hash.from_xml( doc.xpath(@options[:xpath]).first.to_s )
|
20
|
+
puts Hash.from_xml( document.xpath(@options[:xpath]).first.to_s )
|
22
21
|
end
|
23
22
|
|
24
23
|
def process!
|
25
|
-
|
26
|
-
|
27
|
-
doc.xpath(@options[:xpath]).each do |node|
|
24
|
+
document.xpath(@options[:xpath]).each do |node|
|
28
25
|
node_attrs = Hash.from_xml(node.to_s)
|
29
26
|
attrs = @proxy.options[:map_attributes].call( node_attrs )
|
30
27
|
@proxy.process_entry attrs
|
31
28
|
end
|
32
29
|
end
|
30
|
+
|
31
|
+
protected
|
32
|
+
def document
|
33
|
+
Nokogiri::XML(@document, nil, @options[:encoding])
|
34
|
+
end
|
33
35
|
end
|
34
36
|
end
|
35
37
|
end
|
data/lib/ingestor/version.rb
CHANGED
data/samples/people.json
CHANGED
@@ -1,8 +1,8 @@
|
|
1
1
|
{"people":
|
2
2
|
[
|
3
3
|
{
|
4
|
-
"
|
5
|
-
"
|
4
|
+
"first_name": "John",
|
5
|
+
"last_name": "Smith",
|
6
6
|
"age": 25,
|
7
7
|
"address": {
|
8
8
|
"streetAddress": "21 2nd Street",
|
@@ -12,11 +12,11 @@
|
|
12
12
|
}
|
13
13
|
},
|
14
14
|
{
|
15
|
-
"
|
16
|
-
"
|
17
|
-
"age":
|
15
|
+
"first_name": "Joanne",
|
16
|
+
"last_name": "Smith",
|
17
|
+
"age": 93,
|
18
18
|
"address": {
|
19
|
-
"streetAddress": "
|
19
|
+
"streetAddress": "Westchester Street",
|
20
20
|
"city": "New York",
|
21
21
|
"state": "NY",
|
22
22
|
"postalCode": 10021
|
@@ -1,5 +1,25 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'ingestor/parser/json'
|
1
3
|
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
4
|
+
describe Ingestor::Parser::Json do
|
5
|
+
describe '#process!' do
|
6
|
+
before do
|
7
|
+
@proxy = ingest("./samples/people.json") do
|
8
|
+
parser :json
|
9
|
+
parser_options collection: lambda{|document|
|
10
|
+
document['people']
|
11
|
+
}
|
12
|
+
finder{|attrs| Dummy.new}
|
13
|
+
map_attributes{|values|
|
14
|
+
{
|
15
|
+
:name => [ values['first_name'], values["last_name"] ].join(' ')
|
16
|
+
}
|
17
|
+
}
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
it 'should be able to process a JSON file' do
|
22
|
+
Dummy.first.name.should eq "John Smith"
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
@@ -11,14 +11,12 @@ describe Ingestor::Parser::Xml do
|
|
11
11
|
})
|
12
12
|
finder{|attrs| Dummy.new}
|
13
13
|
map_attributes{|values|
|
14
|
-
puts values
|
15
14
|
{:name => values['book']['title']}
|
16
15
|
}
|
17
16
|
end
|
18
17
|
end
|
19
18
|
|
20
19
|
it 'should be able to process an XML file' do
|
21
|
-
|
22
20
|
Dummy.first.name.should eq "XML Developer's Guide"
|
23
21
|
end
|
24
22
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ingestor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -13,7 +13,7 @@ date: 2013-02-22 00:00:00.000000000 Z
|
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: docile
|
16
|
-
requirement: &
|
16
|
+
requirement: &70218534200000 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70218534200000
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: rubyzip
|
27
|
-
requirement: &
|
27
|
+
requirement: &70218534199320 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '0'
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *70218534199320
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: thor
|
38
|
-
requirement: &
|
38
|
+
requirement: &70218534198520 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ! '>='
|
@@ -43,10 +43,21 @@ dependencies:
|
|
43
43
|
version: '0'
|
44
44
|
type: :runtime
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *70218534198520
|
47
|
+
- !ruby/object:Gem::Dependency
|
48
|
+
name: nokogiri
|
49
|
+
requirement: &70218534197600 !ruby/object:Gem::Requirement
|
50
|
+
none: false
|
51
|
+
requirements:
|
52
|
+
- - ~>
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: 1.5.6
|
55
|
+
type: :runtime
|
56
|
+
prerelease: false
|
57
|
+
version_requirements: *70218534197600
|
47
58
|
- !ruby/object:Gem::Dependency
|
48
59
|
name: activesupport
|
49
|
-
requirement: &
|
60
|
+
requirement: &70218534196500 !ruby/object:Gem::Requirement
|
50
61
|
none: false
|
51
62
|
requirements:
|
52
63
|
- - ! '>='
|
@@ -54,7 +65,18 @@ dependencies:
|
|
54
65
|
version: 3.2.0
|
55
66
|
type: :runtime
|
56
67
|
prerelease: false
|
57
|
-
version_requirements: *
|
68
|
+
version_requirements: *70218534196500
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: multi_json
|
71
|
+
requirement: &70218534195400 !ruby/object:Gem::Requirement
|
72
|
+
none: false
|
73
|
+
requirements:
|
74
|
+
- - ~>
|
75
|
+
- !ruby/object:Gem::Version
|
76
|
+
version: '1.0'
|
77
|
+
type: :runtime
|
78
|
+
prerelease: false
|
79
|
+
version_requirements: *70218534195400
|
58
80
|
description: Ingesting local and remote data files into ActiveRecord
|
59
81
|
email:
|
60
82
|
- github@coryodaniel.com
|
@@ -70,8 +92,10 @@ files:
|
|
70
92
|
- README.md
|
71
93
|
- Rakefile
|
72
94
|
- bin/ingest
|
73
|
-
- examples/
|
74
|
-
- examples/
|
95
|
+
- examples/books_xml.rb
|
96
|
+
- examples/colors_json.rb
|
97
|
+
- examples/hotel_chains_plain_text.rb
|
98
|
+
- examples/people_json.rb
|
75
99
|
- ingestor.gemspec
|
76
100
|
- lib/ingestor.rb
|
77
101
|
- lib/ingestor/dsl.rb
|