ingestor 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,21 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ log
14
+ rdoc
15
+ spec/reports
16
+ test/tmp
17
+ test/version_tmp
18
+ tmp
19
+ spec/orm/database.yml
20
+ .DS_Store
21
+ **/.DS_Store
data/Gemfile ADDED
@@ -0,0 +1,18 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in ingestor.gemspec
4
+ gemspec
5
+ gem 'rspec', '2.12.0'
6
+ gem 'rb-fsevent'
7
+ gem "guard", '1.6.1'
8
+ gem "guard-bundler"
9
+ gem "guard-rspec"
10
+ gem 'debugger', '1.2.0'
11
+ gem 'vcr'
12
+ gem 'fakeweb'
13
+ gem 'nokogiri'
14
+ gem 'activerecord', "~>3.2.0"
15
+ gem 'activesupport', "~>3.2.0"
16
+ gem 'mysql2'
17
+ gem 'rb-fsevent'
18
+ gem 'ruby_gntp'
data/Guardfile ADDED
@@ -0,0 +1,11 @@
1
+ guard 'bundler' do
2
+ watch('Gemfile')
3
+ # Uncomment next line if Gemfile contain `gemspec' command
4
+ # watch(/^.+\.gemspec/)
5
+ end
6
+
7
+ guard 'rspec', :version => 2 do
8
+ watch(%r{^spec/.+_spec\.rb$})
9
+ watch(%r{^lib/(.+)\.rb$}) { |m| "spec/lib/#{m[1]}_spec.rb" }
10
+ watch('spec/spec_helper.rb') { "spec" }
11
+ end
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Cory O'Daniel
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,211 @@
1
+ # Ingestor
2
+
3
+ A simple DSL for importing data from text and csv files to ActiveRecord. This was originally designed to
4
+ continually import changing data from EAN and Geonames.
5
+
6
+ ## Installation
7
+
8
+ Add this line to your application's Gemfile:
9
+
10
+ gem 'ingestor'
11
+
12
+ And then execute:
13
+
14
+ $ bundle install
15
+
16
+ Or install it yourself as:
17
+
18
+ $ gem install ingestor
19
+
20
+ Add the following to your Rakefile
21
+ require 'ingestor/tasks'
22
+
23
+ ## Usage
24
+
25
+ Given a text file:
26
+
27
+ id|name|population
28
+ 1|China|1,354,040,000
29
+ 2|India|1,210,193,422
30
+ 3|United States|315,550,000
31
+
32
+ And an AR Class:
33
+
34
+ class Country
35
+ attr_accessible :name, :population
36
+ end
37
+
38
+ Sync the file with AR:
39
+
40
+ ingest("path/to/countries.txt") do
41
+ map_attributes do |values|
42
+ {
43
+ id: values[0],
44
+ name: values[1],
45
+ population: values[2]
46
+ }
47
+ end
48
+
49
+ # attrs holds the current line's mapped values
50
+ finder{|attrs|
51
+ Country.where(id: attrs[:id]).first || Country.new
52
+ }
53
+ end
54
+
55
+ It can handle remote files and zip files as well.
56
+
57
+ ingest("http://example.com/a_lot_of_countries.zip") do
58
+ compressed true
59
+ map_attributes do |values|
60
+ {
61
+ id: values[0],
62
+ name: values[1],
63
+ population: values[2]
64
+ }
65
+ end
66
+
67
+ # attrs holds the current line's mapped values
68
+ finder{|attrs|
69
+ Country.where(id: attrs[:id]).first || Country.new
70
+ }
71
+ end
72
+
73
+ It can handle XML, JSON, and more...
74
+
75
+ ingest("http://example.com/books.xml") do
76
+ parser :xml
77
+ parser_options xpath: '//book'
78
+ map_attributes do |values|
79
+ {
80
+ id: values['id'],
81
+ title: values['title'],
82
+ author: {
83
+ name: values['author']
84
+ }
85
+ }
86
+ end
87
+
88
+ # attrs holds the current entry's mapped values
89
+ finder{|attrs|
90
+ Book.where(id: attrs[:id]).first || Book.new
91
+ }
92
+
93
+ processor{|attrs,record|
94
+ record.update_attributes(attrs)
95
+ record.reviews.create({
96
+ stars: 5,
97
+ comment: "Every book they sell is so great!"
98
+ })
99
+ }
100
+ end
101
+
102
+ ## Advanced Usage
103
+ DSL Options
104
+ * parser - the parser to use on the file
105
+ * Symbol
106
+ * Optional
107
+ * Default: :plain_text
108
+ * Available Values: :plain_text, :xml, :json, :csv, :html
109
+ * See 'Included Parsers' below
110
+ * parser_options - options for a specific parser
111
+ * Hash
112
+ * Optional
113
+ * Default: set per parser
114
+ * See 'Included Parsers' below
115
+ * sample - dump a single raw entry from the file to STDOUT and exit
116
+ * Boolean
117
+ * Optional
118
+ * Default: false
119
+ * When true, a single raw entry is printed to STDOUT and the run exits
120
+ * includes_header - Tells the parser that the first line is a header and should be ignored
121
+ * Boolean
122
+ * Optional
123
+ * Default: false
124
+ * compressed - Should the file be decompressed
125
+ * Boolean
126
+ * Optional
127
+ * Default: false
128
+ * working_directory - where to store remote or decompressed files for local processing
129
+ * String
130
+ * Optional
131
+ * Default: /tmp/ingestor
132
+ * before - callback that receives attributes for each record BEFORE call to [finder]
133
+ * Proc(attributes)
134
+ * Optional
135
+ * Default: nil
136
+ * finder - Arel finder for each object
137
+ * Proc(attributes)
138
+ * Returns: ~ActiveModel
139
+ * Required
140
+ * processor - What to do with the attributes and object
141
+ * Proc(attributes,record)
142
+ * Returns: ~ActiveModel
143
+ * Optional
144
+ * Default: Proc, calls #update_attributes on record without protection
145
+ * after - callback that receives each record after [processor]
146
+ * Proc(record)
147
+ * Optional
148
+
149
+
150
+ ## Included Parsers
151
+
152
+ Writing parsers is simple ([see examples](https://github.com/coryodaniel/ingestor/tree/master/lib/ingestor/parser)).
153
+
154
+ ### Plain Text Parser
155
+ Parses a plain text document.
156
+
157
+ Options
158
+ * delimiter - how to split up each line
159
+ * String
160
+ * Default: '|'
161
+ * Optional
162
+ * line\_processor - override default\_line\_processor. The default\_line\_processor simply splits the string using the delimiter
163
+ * Proc(string)
164
+ * Returns Array
165
+ * Default: nil
166
+ * Optional
167
+
168
+ ### XML Parser
169
+ Parses an XML document
170
+
171
+ Options
172
+ * xpath - XPath selector to get the node collection
173
+ * String
174
+ * Required
175
+ * encoding - XML Encoding. See nokogiri encoding
176
+ * String
177
+ * Optional
178
+ * Default libxml2 best guess
179
+
180
+ ### JSON Parser
181
+ Coming soon...
182
+
183
+ ### CSV Parser
184
+ Coming soon...
185
+
186
+ ### HTML Parser
187
+ Coming soon...
188
+
189
+
190
+ ## Contributing
191
+
192
+ 1. Fork it
193
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
194
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
195
+ 4. Push to the branch (`git push origin my-new-feature`)
196
+ 5. Create new Pull Request
197
+
198
+ ## Running Tests
199
+
200
+ 1. Copy spec/orm/database.example.yml => spec/orm/database.yml
201
+ 2. Configure spec/orm/database.yml
202
+ 3. bundle exec guard
203
+
204
+
205
+ ## Todos
206
+ * rdoc lib/ folder
207
+ * Move includes_header to CSV, PlainText
208
+ * Mongoid Support
209
+ * sort/limit options
210
+ * A way to sample a file without building an ingestor first
211
+ * bin/ingestor --sample --path=./my.xml --parser xml --parser_options_xpath '//book'
data/Rakefile ADDED
@@ -0,0 +1,7 @@
1
+ require "bundler/gem_tasks"
2
+ require 'rspec/core/rake_task'
3
+
4
+ RSpec::Core::RakeTask.new('spec') # defines the `spec` rake task
5
+
6
+ # Make `spec` the default task, so a bare `rake` runs it
7
+ task :default => :spec
data/bin/ingest ADDED
@@ -0,0 +1,73 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'ingestor'
4
+ require 'thor'
5
+ require 'active_support/inflector'
6
+
7
+ module Ingestor
8
+ class CommandLine < Thor
9
+
10
+ desc "generate PATH", "Create a new ingestor for PATH. PATH should be an absolute path or URL"
11
+ def generate(path)
12
+ output_directory = 'script/ingestors'
13
+ FileUtils.mkdir_p(output_directory, mode: 0755) unless Dir.exists?(output_directory)
14
+
15
+ file_name = File.basename(path).underscore.parameterize.underscore
16
+ file_name = %Q{#{Time.now.utc.strftime("%Y%m%d%H%M%S")}_#{file_name}.rb}
17
+
18
+ generated_file = File.join( output_directory, file_name )
19
+
20
+ File.open(generated_file, 'w+') do |f|
21
+ f.puts <<-HEREDOC
22
+ #! /usr/bin/env ruby
23
+ require 'ingestor'
24
+
25
+ ######################################
26
+ #
27
+ # Order of block execution is:
28
+ # * map_attributes
29
+ # * before
30
+ # * finder
31
+ # * processor
32
+ # * after
33
+ #
34
+ ######################################
35
+
36
+ ingest "#{path}" do
37
+ parser :plain_text
38
+ # compressed true # is the file compressed
39
+ # includes_header false
40
+ # parser_options delimiter: '|' # parser specific
41
+ # working_directory '/tmp/ingestor' # where to store files that are compressed or remote
42
+
43
+ # How to map out the columns from document to ActiveRecord
44
+ map_attributes do |values|
45
+ {
46
+ # ... create your attributes hash here for ActiveRecord/ActiveModel/etc.
47
+ # values may be a Hash (xml, json) or an Array (csv, plain_text)
48
+ }
49
+ end
50
+
51
+ # before{ |attrs| attrs}
52
+
53
+ # Your strategy for finding or instantiating a new object to be handled by the processor block
54
+ finder do |attrs|
55
+ MyClass.find( attrs[:id] ) || MyClass.new
56
+ end
57
+
58
+ # The default processor simple calls update_attributes
59
+ # processor do |attrs,record|
60
+ # ... custom processor here ...
61
+ # end
62
+
63
+ # after { |record| record}
64
+ end
65
+ HEREDOC
66
+ end
67
+
68
+ say "Generated #{generated_file}"
69
+ end
70
+ end
71
+ end
72
+
73
+ Ingestor::CommandLine.start
@@ -0,0 +1,56 @@
1
+ #! /usr/bin/env ruby
2
+ require 'rubygems'
3
+ require 'bundler/setup'
4
+ require 'ingestor'
5
+
6
+ # Set up a bogus active model
7
+ require 'active_model'
8
+ class HotelChain
9
+ include ActiveModel::Naming
10
+ def persisted?
11
+ true
12
+ end
13
+ # Make a fake active model
14
+ attr_accessor :id, :name
15
+ def update_attributes(attributes = {})
16
+ attributes.each do |name, value|
17
+ send("#{name}=", value)
18
+ end
19
+ true
20
+ end
21
+ end
22
+ # end bogusness
23
+
24
+ ingest "https://www.ian.com/affiliatecenter/include/V2/ChainList.zip" do
25
+ parser :plain_text
26
+ compressed true
27
+ includes_header true
28
+ # sample true
29
+
30
+ parser_options delimiter: '|'
31
+
32
+ # Map the columns of each delimited line (an Array) to the model's attributes
33
+ map_attributes do |values|
34
+ {
35
+ id: values[0],
36
+ name: values[1]
37
+ }
38
+ end
39
+
40
+ # before{|attrs| attrs}
41
+
42
+ # Your strategy for finding or instantiating a new object to be handled by the processor block
43
+ finder{|attrs|
44
+ # A database-backed model would look the record up here, e.g. Model.where(id: attrs[:id]).first || Model.new
45
+ HotelChain.new
46
+ }
47
+
48
+ processor{|attrs,record|
49
+ # Copy the mapped attributes onto the record
50
+ record.update_attributes attrs
51
+ }
52
+
53
+ after{|record|
54
+ puts "Created: #{record.name}"
55
+ }
56
+ end
@@ -0,0 +1,52 @@
1
+ #! /usr/bin/env ruby
2
+ require 'rubygems'
3
+ require 'bundler/setup'
4
+ require 'ingestor'
5
+ require 'ingestor/parser/xml'
6
+
7
+ # Set up a bogus active model
8
+ require 'active_model'
9
+ class Book
10
+ include ActiveModel::Naming
11
+ def persisted?
12
+ true
13
+ end
14
+ # Make a fake active model
15
+ attr_accessor :id, :title, :author, :price, :genre, :publish_date, :description
16
+ def update_attributes(attributes = {})
17
+ attributes.each do |name, value|
18
+ send("#{name}=", value)
19
+ end
20
+ true
21
+ end
22
+ end
23
+ # end bogusness
24
+
25
+ ingest "./samples/books.xml" do
26
+ parser :xml
27
+ parser_options xpath: '//book'
28
+ #sample true
29
+ # compressed false
30
+
31
+ # Each entry's attributes are taken from its 'book' key
32
+ map_attributes do |values|
33
+ values['book']
34
+ end
35
+
36
+ # before{|attrs| attrs}
37
+
38
+ # Your strategy for finding or instantiating a new object to be handled by the processor block
39
+ finder{|attrs|
40
+ # A database-backed model would look the record up here, e.g. Book.where(id: attrs['id']).first || Book.new
41
+ Book.new
42
+ }
43
+
44
+ processor{|attrs,record|
45
+ # Copy the mapped attributes onto the record
46
+ record.update_attributes attrs
47
+ }
48
+
49
+ after{|record|
50
+ puts "Created: #{record.title}"
51
+ }
52
+ end