ingestor 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +21 -0
- data/Gemfile +18 -0
- data/Guardfile +11 -0
- data/LICENSE.txt +22 -0
- data/README.md +211 -0
- data/Rakefile +7 -0
- data/bin/ingest +73 -0
- data/examples/text_parsing.rb +56 -0
- data/examples/xml_parsing.rb +52 -0
- data/ingestor.gemspec +23 -0
- data/lib/ingestor.rb +37 -0
- data/lib/ingestor/dsl.rb +110 -0
- data/lib/ingestor/parser/base.rb +28 -0
- data/lib/ingestor/parser/csv.rb +8 -0
- data/lib/ingestor/parser/json.rb +8 -0
- data/lib/ingestor/parser/plain_text.rb +44 -0
- data/lib/ingestor/parser/xml.rb +37 -0
- data/lib/ingestor/proxy.rb +113 -0
- data/lib/ingestor/tasks.rb +15 -0
- data/lib/ingestor/version.rb +3 -0
- data/samples/animals.csv +7 -0
- data/samples/books.xml +32 -0
- data/samples/colors.json +30 -0
- data/samples/flags.txt +12 -0
- data/samples/people.json +26 -0
- data/spec/cassettes/remote-zipped-files.yml +186 -0
- data/spec/lib/ingestor/dsl_spec.rb +114 -0
- data/spec/lib/ingestor/parser/csv_spec.rb +5 -0
- data/spec/lib/ingestor/parser/json_spec.rb +5 -0
- data/spec/lib/ingestor/parser/plain_text_spec.rb +24 -0
- data/spec/lib/ingestor/parser/xml_spec.rb +25 -0
- data/spec/lib/ingestor/proxy_spec.rb +129 -0
- data/spec/lib/ingestor_spec.rb +19 -0
- data/spec/orm/active_record.rb +33 -0
- data/spec/orm/database.example.yml +15 -0
- data/spec/spec_helper.rb +21 -0
- metadata +139 -0
data/.gitignore
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
*.gem
|
2
|
+
*.rbc
|
3
|
+
.bundle
|
4
|
+
.config
|
5
|
+
.yardoc
|
6
|
+
Gemfile.lock
|
7
|
+
InstalledFiles
|
8
|
+
_yardoc
|
9
|
+
coverage
|
10
|
+
doc/
|
11
|
+
lib/bundler/man
|
12
|
+
pkg
|
13
|
+
log
|
14
|
+
rdoc
|
15
|
+
spec/reports
|
16
|
+
test/tmp
|
17
|
+
test/version_tmp
|
18
|
+
tmp
|
19
|
+
spec/orm/database.yml
|
20
|
+
.DS_Store
|
21
|
+
**/.DS_Store
|
data/Gemfile
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
source 'https://rubygems.org'

# Specify your gem's dependencies in ingestor.gemspec
gemspec

# Development and test tooling. Each gem is declared exactly once; a repeated
# declaration makes Bundler warn that the Gemfile lists the gem more than once.
gem 'rspec', '2.12.0'
gem 'rb-fsevent'
gem "guard", '1.6.1'
gem "guard-bundler"
gem "guard-rspec"
gem 'debugger', '1.2.0'
gem 'vcr'
gem 'fakeweb'
gem 'nokogiri'
gem 'activerecord', "~>3.2.0"
gem 'activesupport', "~>3.2.0"
gem 'mysql2'
gem 'ruby_gntp'
|
data/Guardfile
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
# Re-run Bundler whenever the dependency manifests change.
guard 'bundler' do
  watch('Gemfile')
  # The Gemfile uses the `gemspec` command, so edits to the gemspec must
  # trigger Bundler as well.
  watch(/^.+\.gemspec/)
end

# Run the matching specs when a spec or a library file changes.
guard 'rspec', :version => 2 do
  # A changed spec file runs itself.
  watch(%r{^spec/.+_spec\.rb$})
  # lib/foo/bar.rb maps to spec/lib/foo/bar_spec.rb
  watch(%r{^lib/(.+)\.rb$}) { |m| "spec/lib/#{m[1]}_spec.rb" }
  # Touching the shared helper re-runs the whole spec directory.
  watch('spec/spec_helper.rb') { "spec" }
end
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2013 Cory O'Daniel
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,211 @@
|
|
1
|
+
# Ingestor
|
2
|
+
|
3
|
+
A simple DSL for importing data from text and csv files to ActiveRecord. This was originally designed to
|
4
|
+
continually import changing data from EAN and Geonames.
|
5
|
+
|
6
|
+
## Installation
|
7
|
+
|
8
|
+
Add this line to your application's Gemfile:
|
9
|
+
|
10
|
+
gem 'ingestor'
|
11
|
+
|
12
|
+
And then execute:
|
13
|
+
|
14
|
+
$ bundle install
|
15
|
+
|
16
|
+
Or install it yourself as:
|
17
|
+
|
18
|
+
$ gem install ingestor
|
19
|
+
|
20
|
+
Add the following to your Rakefile
|
21
|
+
require 'ingestor/tasks'
|
22
|
+
|
23
|
+
## Usage
|
24
|
+
|
25
|
+
Given a text file:
|
26
|
+
|
27
|
+
id|name|population
|
28
|
+
1|China|1,354,040,000
|
29
|
+
2|India|1,210,193,422
|
30
|
+
3|United States|315,550,000
|
31
|
+
|
32
|
+
And an AR Class:
|
33
|
+
|
34
|
+
class Country < ActiveRecord::Base
|
35
|
+
attr_accessible :name, :population
|
36
|
+
end
|
37
|
+
|
38
|
+
Sync the file with AR:
|
39
|
+
|
40
|
+
ingest("path/to/countries.txt") do
|
41
|
+
map_attributes do |values|
|
42
|
+
{
|
43
|
+
id: values[0],
|
44
|
+
name: values[1],
|
45
|
+
population: values[2]
|
46
|
+
}
|
47
|
+
end
|
48
|
+
|
49
|
+
# current line's values
|
50
|
+
finder{|attrs|
|
51
|
+
Country.where(id: attrs[:id]).first || Country.new
|
52
|
+
}
|
53
|
+
end
|
54
|
+
|
55
|
+
It can handle remote files and zip files as well.
|
56
|
+
|
57
|
+
ingest("http://example.com/a_lot_of_countries.zip") do
|
58
|
+
compressed true
|
59
|
+
map_attributes do |values|
|
60
|
+
{
|
61
|
+
id: values[0],
|
62
|
+
name: values[1],
|
63
|
+
population: values[2]
|
64
|
+
}
|
65
|
+
end
|
66
|
+
|
67
|
+
# current line's values
|
68
|
+
finder{|attrs|
|
69
|
+
Country.where(id: attrs[:id]).first || Country.new
|
70
|
+
}
|
71
|
+
end
|
72
|
+
|
73
|
+
It can handle XML, JSON, and more...
|
74
|
+
|
75
|
+
ingest("http://example.com/books.xml") do
|
76
|
+
parser :xml
|
77
|
+
parser_options xpath: '//book'
|
78
|
+
map_attributes do |values|
|
79
|
+
{
|
80
|
+
id: values['id'],
|
81
|
+
title: values['title'],
|
82
|
+
author: {
|
83
|
+
name: values['author']
|
84
|
+
}
|
85
|
+
}
|
86
|
+
end
|
87
|
+
|
88
|
+
# current entry's values
|
89
|
+
finder{|attrs|
|
90
|
+
Book.where(id: attrs[:id]).first || Book.new
|
91
|
+
}
|
92
|
+
|
93
|
+
processor{|attrs,record|
|
94
|
+
record.update_attributes(attrs)
|
95
|
+
record.reviews.create({
|
96
|
+
stars: 5,
|
97
|
+
comment: "Every book they sell is so great!"
|
98
|
+
})
|
99
|
+
}
|
100
|
+
end
|
101
|
+
|
102
|
+
## Advanced Usage
|
103
|
+
DSL Options
|
104
|
+
* parser - the parser to use on the file
|
105
|
+
* Symbol
|
106
|
+
* Optional
|
107
|
+
* Default: :plain_text
|
108
|
+
* Available Values: :plain_text, :xml, :json, :csv, :html
|
109
|
+
* See 'Included Parsers' below
|
110
|
+
* parser_options - options for a specific parser
|
111
|
+
* Hash
|
112
|
+
* Optional
|
113
|
+
* Default: set per parser
|
114
|
+
* See 'Included Parsers' below
|
115
|
+
* sample - dump a single raw entry from the file to STDOUT and exit
|
116
|
+
* Boolean
|
117
|
+
* Optional
|
118
|
+
* Default: false
|
119
|
+
(defaults: false) will
|
120
|
+
* includes_header - Tells the parser that the first line is a header and should be ignored
|
121
|
+
* Boolean
|
122
|
+
* Optional
|
123
|
+
* Default: false
|
124
|
+
* compressed - Should the file be decompressed
|
125
|
+
* Boolean
|
126
|
+
* Optional
|
127
|
+
* Default: false
|
128
|
+
* working_directory - where to store remote or decompressed files for local processing
|
129
|
+
* String
|
130
|
+
* Optional
|
131
|
+
* Default: /tmp/ingestor
|
132
|
+
* before - callback that receives attributes for each record BEFORE call to [finder]
|
133
|
+
* Proc(attributes)
|
134
|
+
* Optional
|
135
|
+
* Default: nil
|
136
|
+
* finder - Arel finder for each object
|
137
|
+
* Proc(attributes)
|
138
|
+
* Returns: ~ActiveModel
|
139
|
+
* Required
|
140
|
+
* processor - What to do with the attributes and object
|
141
|
+
* Proc(attributes,record)
|
142
|
+
* Returns: ~ActiveModel
|
143
|
+
* Optional
|
144
|
+
* Default: Proc, calls #update_attributes on record without protection
|
145
|
+
* after - callback that receives each record after [processor]
|
146
|
+
* Proc(record)
|
147
|
+
* Optional
|
148
|
+
|
149
|
+
|
150
|
+
## Included Parsers
|
151
|
+
|
152
|
+
Writing parsers is simple ([see examples](https://github.com/coryodaniel/ingestor/tree/master/lib/ingestor/parser)).
|
153
|
+
|
154
|
+
### Plain Text Parser
|
155
|
+
Parses a plain text document.
|
156
|
+
|
157
|
+
Options
|
158
|
+
* delimiter - how to split up each line
|
159
|
+
* String
|
160
|
+
* Default: '|'
|
161
|
+
* Optional
|
162
|
+
* line\_processor - override default\_line\_processor. The default\_line\_processor simply splits the string using the delimiter
|
163
|
+
* Proc(string)
|
164
|
+
* Returns Array
|
165
|
+
* Default: nil
|
166
|
+
* Optional
|
167
|
+
|
168
|
+
### XML Parser
|
169
|
+
Parses an XML document
|
170
|
+
|
171
|
+
Options
|
172
|
+
* selector - xpath selector to get the node collection
|
173
|
+
* String
|
174
|
+
* Required
|
175
|
+
* encoding - XML Encoding. See nokogiri encoding
|
176
|
+
* String
|
177
|
+
* Optional
|
178
|
+
* Default libxml2 best guess
|
179
|
+
|
180
|
+
### JSON Parser
|
181
|
+
Coming soon...
|
182
|
+
|
183
|
+
### CSV Parser
|
184
|
+
Coming soon...
|
185
|
+
|
186
|
+
### HTML Parser
|
187
|
+
Coming soon...
|
188
|
+
|
189
|
+
|
190
|
+
## Contributing
|
191
|
+
|
192
|
+
1. Fork it
|
193
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
194
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
195
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
196
|
+
5. Create new Pull Request
|
197
|
+
|
198
|
+
## Running Tests
|
199
|
+
|
200
|
+
1. Copy spec/orm/database.example.yml => spec/orm/database.yml
|
201
|
+
2. Configure spec/orm/database.yml
|
202
|
+
3. bundle exec guard
|
203
|
+
|
204
|
+
|
205
|
+
## Todos
|
206
|
+
* rdoc lib/ folder
|
207
|
+
* Move includes_header to CSV, PlainText
|
208
|
+
* Mongoid Support
|
209
|
+
* sort/limit options
|
210
|
+
* A way to sample a file without building an ingestor first
|
211
|
+
* bin/ingestor --sample --path=./my.xml --parser xml --parser_options_xpath '//book'
|
data/Rakefile
ADDED
data/bin/ingest
ADDED
@@ -0,0 +1,73 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'ingestor'
|
4
|
+
require 'thor'
|
5
|
+
require 'active_support/inflector'
|
6
|
+
|
7
|
+
require 'fileutils'

module Ingestor
  # Thor-based command line interface behind the `ingest` executable.
  class CommandLine < Thor

    desc "generate PATH", "Create a new ingestor for PATH. PATH should be an absolute path or URL"
    # Writes a timestamped ingestor skeleton for +path+ into script/ingestors
    # (relative to the current working directory) and prints the name of the
    # generated file.
    #
    # path - String location of the document to ingest. It is embedded in the
    #        generated script as the argument to `ingest`.
    def generate(path)
      output_directory = 'script/ingestors'
      # mkdir_p is a no-op when the directory already exists, so no existence
      # check (and no deprecated Dir.exists?) is needed.
      FileUtils.mkdir_p(output_directory, mode: 0755)

      # e.g. "My File.txt" => "20130101120000_my_file_txt.rb"
      file_name = File.basename(path).underscore.parameterize.underscore
      file_name = %Q{#{Time.now.utc.strftime("%Y%m%d%H%M%S")}_#{file_name}.rb}

      generated_file = File.join(output_directory, file_name)

      # path.inspect emits a correctly escaped double-quoted Ruby literal, so a
      # path containing quotes or "#{" cannot break the generated script.
      # The template body starts at column 0 because <<- keeps leading
      # whitespace and the shebang must be the first bytes of the file.
      template = <<-HEREDOC
#! /usr/bin/env ruby
require 'ingestor'

######################################
#
# Order of block execution is:
#   * map_attributes
#   * before
#   * finder
#   * processor
#   * after
#
######################################

ingest #{path.inspect} do
  parser :plain_text
  # compressed true # is the file compressed
  # includes_header false
  # parser_options delimiter: '|' # parser specific
  # working_directory '/tmp/ingestor' # where to store files that are compressed or remote

  # How to map out the columns from document to ActiveRecord
  map_attributes do |values|
    {
      # ... create your attributes hash here for ActiveRecord/ActiveModel/etc.
      # values may be a Hash (xml, json) or an Array (csv, plain_text)
    }
  end

  # before{ |attrs| attrs}

  # Your strategy for finding or instantiating a new object to be handled by the processor block
  finder do |attrs|
    MyClass.where(id: attrs[:id]).first || MyClass.new
  end

  # The default processor simply calls update_attributes
  # processor do |attrs,record|
  #   ... custom processor here ...
  # end

  # after { |record| record}
end
      HEREDOC

      File.open(generated_file, 'w+') { |f| f.puts template }

      say "Generated #{generated_file}"
    end
  end
end
|
72
|
+
|
73
|
+
Ingestor::CommandLine.start
|
@@ -0,0 +1,56 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
require 'rubygems'
|
3
|
+
require 'bundler/setup'
|
4
|
+
require 'ingestor'
|
5
|
+
|
6
|
+
# Set up a bogus active model
|
7
|
+
require 'active_model'
|
8
|
+
# Minimal stand-in for a persisted model, used only by this example so that no
# database is required.
class HotelChain
  # NOTE(review): ActiveModel::Naming is documented for use with `extend`;
  # confirm whether `include` is intentional here.
  include ActiveModel::Naming

  attr_accessor :id, :name

  # Always reports the object as already saved.
  def persisted?
    true
  end

  # Assigns every key/value pair through the matching writer and returns true.
  #
  # attributes - Hash of attribute name => value (defaults to an empty Hash).
  def update_attributes(attributes = {})
    attributes.each { |key, val| send("#{key}=", val) }
    true
  end
end
|
22
|
+
# end bogusness
|
23
|
+
|
24
|
+
# Ingest the zipped, pipe-delimited chain list into HotelChain objects.
ingest "https://www.ian.com/affiliatecenter/include/V2/ChainList.zip" do
  parser :plain_text
  compressed true
  includes_header true
  # sample true

  parser_options delimiter: '|'

  # Map the columns of one parsed line onto model attributes.
  map_attributes { |values| { id: values[0], name: values[1] } }

  # before { |attrs| attrs }

  # Strategy for finding or instantiating the object handed to the processor block.
  finder do |attrs|
    # e.g. HotelChain.where(id: attrs[:id]).first || HotelChain.new
    HotelChain.new
  end

  # Custom processor: push the mapped attributes onto the record.
  processor do |attrs, record|
    record.update_attributes(attrs)
  end

  # Report each record once it has been processed.
  after do |record|
    puts "Created: #{record.name}"
  end
end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
require 'rubygems'
|
3
|
+
require 'bundler/setup'
|
4
|
+
require 'ingestor'
|
5
|
+
require 'ingestor/parser/xml'
|
6
|
+
|
7
|
+
# Set up a bogus active model
|
8
|
+
require 'active_model'
|
9
|
+
# Minimal stand-in for a persisted model, used only by this example so that no
# database is required.
class Book
  # NOTE(review): ActiveModel::Naming is documented for use with `extend`;
  # confirm whether `include` is intentional here.
  include ActiveModel::Naming

  attr_accessor :id, :title, :author, :price, :genre, :publish_date, :description

  # Always reports the object as already saved.
  def persisted?
    true
  end

  # Assigns every key/value pair through the matching writer and returns true.
  #
  # attributes - Hash of attribute name => value (defaults to an empty Hash).
  def update_attributes(attributes = {})
    attributes.each { |key, val| send("#{key}=", val) }
    true
  end
end
|
23
|
+
# end bogusness
|
24
|
+
|
25
|
+
# Ingest every node matched by //book in the local sample file into Book objects.
ingest "./samples/books.xml" do
  parser :xml
  parser_options xpath: '//book'
  # sample true
  # compressed false

  # Each parsed entry keeps its fields under the 'book' key; use that as the attributes.
  map_attributes { |values| values['book'] }

  # before { |attrs| attrs }

  # Strategy for finding or instantiating the object handed to the processor block.
  finder do |attrs|
    # e.g. Book.where(id: attrs['id']).first || Book.new
    Book.new
  end

  # Custom processor: push the mapped attributes onto the record.
  processor do |attrs, record|
    record.update_attributes(attrs)
  end

  # Report each record once it has been processed.
  after do |record|
    puts "Created: #{record.title}"
  end
end
|