ingestor 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +21 -0
- data/Gemfile +18 -0
- data/Guardfile +11 -0
- data/LICENSE.txt +22 -0
- data/README.md +211 -0
- data/Rakefile +7 -0
- data/bin/ingest +73 -0
- data/examples/text_parsing.rb +56 -0
- data/examples/xml_parsing.rb +52 -0
- data/ingestor.gemspec +23 -0
- data/lib/ingestor.rb +37 -0
- data/lib/ingestor/dsl.rb +110 -0
- data/lib/ingestor/parser/base.rb +28 -0
- data/lib/ingestor/parser/csv.rb +8 -0
- data/lib/ingestor/parser/json.rb +8 -0
- data/lib/ingestor/parser/plain_text.rb +44 -0
- data/lib/ingestor/parser/xml.rb +37 -0
- data/lib/ingestor/proxy.rb +113 -0
- data/lib/ingestor/tasks.rb +15 -0
- data/lib/ingestor/version.rb +3 -0
- data/samples/animals.csv +7 -0
- data/samples/books.xml +32 -0
- data/samples/colors.json +30 -0
- data/samples/flags.txt +12 -0
- data/samples/people.json +26 -0
- data/spec/cassettes/remote-zipped-files.yml +186 -0
- data/spec/lib/ingestor/dsl_spec.rb +114 -0
- data/spec/lib/ingestor/parser/csv_spec.rb +5 -0
- data/spec/lib/ingestor/parser/json_spec.rb +5 -0
- data/spec/lib/ingestor/parser/plain_text_spec.rb +24 -0
- data/spec/lib/ingestor/parser/xml_spec.rb +25 -0
- data/spec/lib/ingestor/proxy_spec.rb +129 -0
- data/spec/lib/ingestor_spec.rb +19 -0
- data/spec/orm/active_record.rb +33 -0
- data/spec/orm/database.example.yml +15 -0
- data/spec/spec_helper.rb +21 -0
- metadata +139 -0
data/.gitignore
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
*.gem
|
2
|
+
*.rbc
|
3
|
+
.bundle
|
4
|
+
.config
|
5
|
+
.yardoc
|
6
|
+
Gemfile.lock
|
7
|
+
InstalledFiles
|
8
|
+
_yardoc
|
9
|
+
coverage
|
10
|
+
doc/
|
11
|
+
lib/bundler/man
|
12
|
+
pkg
|
13
|
+
log
|
14
|
+
rdoc
|
15
|
+
spec/reports
|
16
|
+
test/tmp
|
17
|
+
test/version_tmp
|
18
|
+
tmp
|
19
|
+
spec/orm/database.yml
|
20
|
+
.DS_Store
|
21
|
+
**/.DS_Store
|
data/Gemfile
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
source 'https://rubygems.org'

# Specify your gem's dependencies in ingestor.gemspec
gemspec

# Development and test dependencies. Each gem is declared exactly once:
# the original listed rb-fsevent twice, which makes Bundler emit a
# "lists the gem more than once" warning.
gem 'rspec', '2.12.0'
gem 'rb-fsevent'
gem 'guard', '1.6.1'
gem 'guard-bundler'
gem 'guard-rspec'
gem 'debugger', '1.2.0'
gem 'vcr'
gem 'fakeweb'
gem 'nokogiri'
gem 'activerecord', '~>3.2.0'
gem 'activesupport', '~>3.2.0'
gem 'mysql2'
gem 'ruby_gntp'
|
data/Guardfile
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
# Re-bundle whenever the dependency declarations change.
guard 'bundler' do
  watch('Gemfile')
  # The Gemfile loads its dependencies through the `gemspec` command, so a
  # change to the gemspec has to trigger a re-bundle as well.
  watch(/^.+\.gemspec/)
end

# Run specs on change.
guard 'rspec', :version => 2 do
  # A changed spec file is matched directly.
  watch(%r{^spec/.+_spec\.rb$})
  # A changed file under lib/ maps to the spec at the mirrored path under spec/lib/.
  watch(%r{^lib/(.+)\.rb$}) { |m| "spec/lib/#{m[1]}_spec.rb" }
  # A changed spec helper maps to the whole spec directory.
  watch('spec/spec_helper.rb') { "spec" }
end
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2013 Cory O'Daniel
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,211 @@
|
|
1
|
+
# Ingestor
|
2
|
+
|
3
|
+
A simple DSL for importing data from text and csv files to ActiveRecord. This was originally designed to
|
4
|
+
continually import changing data from EAN and Geonames.
|
5
|
+
|
6
|
+
## Installation
|
7
|
+
|
8
|
+
Add this line to your application's Gemfile:
|
9
|
+
|
10
|
+
gem 'ingestor'
|
11
|
+
|
12
|
+
And then execute:
|
13
|
+
|
14
|
+
$ bundle install
|
15
|
+
|
16
|
+
Or install it yourself as:
|
17
|
+
|
18
|
+
$ gem install ingestor
|
19
|
+
|
20
|
+
Add the following to your Rakefile
|
21
|
+
require 'ingestor/tasks'
|
22
|
+
|
23
|
+
## Usage
|
24
|
+
|
25
|
+
Given a text file:
|
26
|
+
|
27
|
+
id|name|population
|
28
|
+
1|China|1,354,040,000
|
29
|
+
2|India|1,210,193,422
|
30
|
+
3|United States|315,550,000
|
31
|
+
|
32
|
+
And an AR Class:
|
33
|
+
|
34
|
+
class Country < ActiveRecord::Base
|
35
|
+
attr_accessible :name, :population
|
36
|
+
end
|
37
|
+
|
38
|
+
Sync the file with AR:
|
39
|
+
|
40
|
+
ingest("path/to/countries.txt") do
|
41
|
+
map_attributes do |values|
|
42
|
+
{
|
43
|
+
id: values[0],
|
44
|
+
name: values[1],
|
45
|
+
population: values[2]
|
46
|
+
}
|
47
|
+
end
|
48
|
+
|
49
|
+
# attrs is the attributes hash built by map_attributes for the current line
|
50
|
+
finder{|attrs|
|
51
|
+
Country.where(id: attrs[:id]).first || Country.new
|
52
|
+
}
|
53
|
+
end
|
54
|
+
|
55
|
+
It can handle remote files and zip files as well.
|
56
|
+
|
57
|
+
ingest("http://example.com/a_lot_of_countries.zip") do
|
58
|
+
compressed true
|
59
|
+
map_attributes do |values|
|
60
|
+
{
|
61
|
+
id: values[0],
|
62
|
+
name: values[1],
|
63
|
+
population: values[2]
|
64
|
+
}
|
65
|
+
end
|
66
|
+
|
67
|
+
# attrs is the attributes hash built by map_attributes for the current line
|
68
|
+
finder{|attrs|
|
69
|
+
Country.where(id: attrs[:id]).first || Country.new
|
70
|
+
}
|
71
|
+
end
|
72
|
+
|
73
|
+
It can handle XML, JSON, and more...
|
74
|
+
|
75
|
+
ingest("http://example.com/books.xml") do
|
76
|
+
parser :xml
|
77
|
+
parser_options xpath: '//book'
|
78
|
+
map_attributes do |values|
|
79
|
+
{
|
80
|
+
id: values['id'],
|
81
|
+
title: values['title'],
|
82
|
+
author: {
|
83
|
+
name: values['author']
|
84
|
+
}
|
85
|
+
}
|
86
|
+
end
|
87
|
+
|
88
|
+
# attrs is the attributes hash built by map_attributes for the current entry
|
89
|
+
finder{|attrs|
|
90
|
+
Book.where(id: attrs[:id]).first || Book.new
|
91
|
+
}
|
92
|
+
|
93
|
+
processor{|attrs,record|
|
94
|
+
record.update_attributes(attrs)
|
95
|
+
record.reviews.create({
|
96
|
+
stars: 5,
|
97
|
+
comment: "Every book they sell is so great!"
|
98
|
+
})
|
99
|
+
}
|
100
|
+
end
|
101
|
+
|
102
|
+
## Advanced Usage
|
103
|
+
DSL Options
|
104
|
+
* parser - the parser to use on the file
|
105
|
+
* Symbol
|
106
|
+
* Optional
|
107
|
+
* Default: :plain_text
|
108
|
+
* Available Values: :plain_text, :xml, :json, :csv, :html
|
109
|
+
* See 'Included Parsers' below
|
110
|
+
* parser_options - options for a specific parser
|
111
|
+
* Hash
|
112
|
+
* Optional
|
113
|
+
* Default: set per parser
|
114
|
+
* See 'Included Parsers' below
|
115
|
+
* sample - dump a single raw entry from the file to STDOUT and exit
|
116
|
+
* Boolean
|
117
|
+
* Optional
|
118
|
+
* Default: false
|
119
|
+
* When true, a single raw entry is printed to STDOUT and the run exits
|
120
|
+
* includes_header - Tells the parser that the first line is a header and should be ignored
|
121
|
+
* Boolean
|
122
|
+
* Optional
|
123
|
+
* Default: false
|
124
|
+
* compressed - Should the file be decompressed
|
125
|
+
* Boolean
|
126
|
+
* Optional
|
127
|
+
* Default: false
|
128
|
+
* working_directory - where to store remote or decompressed files for local processing
|
129
|
+
* String
|
130
|
+
* Optional
|
131
|
+
* Default: /tmp/ingestor
|
132
|
+
* before - callback that receives attributes for each record BEFORE call to [finder]
|
133
|
+
* Proc(attributes)
|
134
|
+
* Optional
|
135
|
+
* Default: nil
|
136
|
+
* finder - Arel finder for each object
|
137
|
+
* Proc(attributes)
|
138
|
+
* Returns: ~ActiveModel
|
139
|
+
* Required
|
140
|
+
* processor - What to do with the attributes and object
|
141
|
+
* Proc(attributes,record)
|
142
|
+
* Returns: ~ActiveModel
|
143
|
+
* Optional
|
144
|
+
* Default: Proc, calls #update_attributes on record without protection
|
145
|
+
* after - callback that receives each record after [processor]
|
146
|
+
* Proc(record)
|
147
|
+
* Optional
|
148
|
+
|
149
|
+
|
150
|
+
## Included Parsers
|
151
|
+
|
152
|
+
Writing parsers is simple ([see examples](https://github.com/coryodaniel/ingestor/tree/master/lib/ingestor/parser)).
|
153
|
+
|
154
|
+
### Plain Text Parser
|
155
|
+
Parses a plain text document.
|
156
|
+
|
157
|
+
Options
|
158
|
+
* delimiter - how to split up each line
|
159
|
+
* String
|
160
|
+
* Default: '|'
|
161
|
+
* Optional
|
162
|
+
* line\_processor - override default\_line\_processor. The default\_line\_processor simply splits the string using the delimiter
|
163
|
+
* Proc(string)
|
164
|
+
* Returns Array
|
165
|
+
* Default: nil
|
166
|
+
* Optional
|
167
|
+
|
168
|
+
### XML Parser
|
169
|
+
Parses an XML document
|
170
|
+
|
171
|
+
Options
|
172
|
+
* xpath - XPath selector used to get the node collection
|
173
|
+
* String
|
174
|
+
* Required
|
175
|
+
* encoding - XML Encoding. See nokogiri encoding
|
176
|
+
* String
|
177
|
+
* Optional
|
178
|
+
* Default: libxml2's best guess
|
179
|
+
|
180
|
+
### JSON Parser
|
181
|
+
Coming soon...
|
182
|
+
|
183
|
+
### CSV Parser
|
184
|
+
Coming soon...
|
185
|
+
|
186
|
+
### HTML Parser
|
187
|
+
Coming soon...
|
188
|
+
|
189
|
+
|
190
|
+
## Contributing
|
191
|
+
|
192
|
+
1. Fork it
|
193
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
194
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
195
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
196
|
+
5. Create new Pull Request
|
197
|
+
|
198
|
+
## Running Tests
|
199
|
+
|
200
|
+
1. Copy spec/orm/database.example.yml => spec/orm/database.yml
|
201
|
+
2. Configure spec/orm/database.yml
|
202
|
+
3. bundle exec guard
|
203
|
+
|
204
|
+
|
205
|
+
## Todos
|
206
|
+
* rdoc lib/ folder
|
207
|
+
* Move includes_header to CSV, PlainText
|
208
|
+
* Mongoid Support
|
209
|
+
* sort/limit options
|
210
|
+
* A way to sample a file without building an ingestor first
|
211
|
+
* bin/ingest --sample --path=./my.xml --parser xml --parser_options_xpath '//book'
|
data/Rakefile
ADDED
data/bin/ingest
ADDED
@@ -0,0 +1,73 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'fileutils'

require 'ingestor'
require 'thor'
require 'active_support/inflector'
|
6
|
+
|
7
|
+
module Ingestor
  # Thor-based command line interface started at the bottom of this script.
  class CommandLine < Thor

    desc "generate PATH", "Create a new ingestor for PATH. PATH should be an absolute path or URL"
    # Writes a skeleton ingestor script for +path+ into script/ingestors
    # (relative to the current directory) and reports the generated file name.
    #
    # The file name is a UTC timestamp followed by the underscored,
    # parameterized basename of +path+, with an ".rb" extension.
    #
    # @param path [String] path or URL of the document the ingestor will read
    def generate(path)
      output_directory = 'script/ingestors'
      # mkdir_p succeeds silently when the directory already exists, so no
      # existence guard is needed (Dir.exists? is gone from current Rubies).
      FileUtils.mkdir_p(output_directory, mode: 0755)

      base_name      = File.basename(path).underscore.parameterize.underscore
      timestamp      = Time.now.utc.strftime("%Y%m%d%H%M%S")
      generated_file = File.join(output_directory, "#{timestamp}_#{base_name}.rb")

      File.open(generated_file, 'w') do |f|
        # path.inspect emits a correctly quoted and escaped Ruby string literal,
        # so quotes or "#{" sequences in PATH cannot break the generated script.
        f.puts <<-HEREDOC
#! /usr/bin/env ruby
require 'ingestor'

######################################
#
# Order of block execution is:
# * map_attributes
# * before
# * finder
# * processor
# * after
#
######################################

ingest #{path.inspect} do
  parser :plain_text
  # compressed true # is the file compressed
  # includes_header false
  # parser_options delimiter: '|' # parser specific
  # working_directory '/tmp/ingestor' # where to store files that are compressed or remote

  # How to map out the columns from document to ActiveRecord
  map_attributes do |values|
    {
      # ... create your attributes hash here for ActiveRecord/ActiveModel/etc.
      # values may be a Hash (xml, json) or an Array (csv, plain_text)
    }
  end

  # before{ |attrs| attrs}

  # Your strategy for finding or instantiating a new object to be handled by the processor block
  finder do |attrs|
    MyClass.where(id: attrs[:id]).first || MyClass.new
  end

  # The default processor simply calls update_attributes
  # processor do |attrs,record|
  #   ... custom processor here ...
  # end

  # after { |record| record}
end
        HEREDOC
      end

      say "Generated #{generated_file}"
    end
  end
end
|
72
|
+
|
73
|
+
Ingestor::CommandLine.start
|
@@ -0,0 +1,56 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
require 'rubygems'
|
3
|
+
require 'bundler/setup'
|
4
|
+
require 'ingestor'
|
5
|
+
|
6
|
+
# Set up a bogus active model
|
7
|
+
require 'active_model'
|
8
|
+
# Stand-in model for the example: plain accessors plus the two methods the
# example relies on, with no database behind it.
class HotelChain
  include ActiveModel::Naming

  attr_accessor :id, :name

  # Always reports the object as persisted.
  def persisted?
    true
  end

  # Assigns each key/value pair through the matching writer method.
  # Always returns true.
  def update_attributes(attributes = {})
    attributes.each { |key, val| send("#{key}=", val) }
    true
  end
end
|
22
|
+
# end bogusness
|
23
|
+
|
24
|
+
# Ingest the remote, zipped, pipe-delimited chain list into HotelChain objects.
ingest "https://www.ian.com/affiliatecenter/include/V2/ChainList.zip" do
  parser :plain_text
  compressed true
  includes_header true
  # sample true

  parser_options delimiter: '|'

  # Map the columns of the current line to model attributes
  map_attributes do |columns|
    { id: columns[0], name: columns[1] }
  end

  # before { |attrs| attrs }

  # Strategy for finding or instantiating the object handed to the processor block.
  # The stand-in HotelChain defines no lookup, so a new instance is built every time.
  finder do |_attrs|
    HotelChain.new
  end

  # Copy the mapped attributes onto the record
  processor do |attrs, record|
    record.update_attributes attrs
  end

  after do |record|
    puts "Created: #{record.name}"
  end
end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
require 'rubygems'
|
3
|
+
require 'bundler/setup'
|
4
|
+
require 'ingestor'
|
5
|
+
require 'ingestor/parser/xml'
|
6
|
+
|
7
|
+
# Set up a bogus active model
|
8
|
+
require 'active_model'
|
9
|
+
# Stand-in model for the example: plain accessors plus the two methods the
# example relies on, with no database behind it.
class Book
  include ActiveModel::Naming

  attr_accessor :id, :title, :author, :price, :genre, :publish_date, :description

  # Always reports the object as persisted.
  def persisted?
    true
  end

  # Assigns each key/value pair through the matching writer method.
  # Always returns true.
  def update_attributes(attributes = {})
    attributes.each { |key, val| send("#{key}=", val) }
    true
  end
end
|
23
|
+
# end bogusness
|
24
|
+
|
25
|
+
# Ingest the sample XML file, one entry per node matched by the xpath option.
ingest "./samples/books.xml" do
  parser :xml
  parser_options xpath: '//book'
  # sample true
  # compressed false

  # Map each XML entry to model attributes: the value under its 'book' key
  map_attributes do |entry|
    entry['book']
  end

  # before { |attrs| attrs }

  # Strategy for finding or instantiating the object handed to the processor block.
  # The stand-in Book defines no lookup, so a new instance is built every time.
  finder do |_attrs|
    Book.new
  end

  # Copy the mapped attributes onto the record
  processor do |attrs, record|
    record.update_attributes attrs
  end

  after do |record|
    puts "Created: #{record.title}"
  end
end
|