ingestor 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,21 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ log
14
+ rdoc
15
+ spec/reports
16
+ test/tmp
17
+ test/version_tmp
18
+ tmp
19
+ spec/orm/database.yml
20
+ .DS_Store
21
+ **/.DS_Store
data/Gemfile ADDED
@@ -0,0 +1,18 @@
1
source 'https://rubygems.org'

# Specify your gem's dependencies in ingestor.gemspec
gemspec

# Test framework and HTTP recording/stubbing helpers.
gem 'rspec', '2.12.0'
gem 'vcr'
gem 'fakeweb'

# Guard and its plugins, plus the file-system event and notification gems.
gem 'guard', '1.6.1'
gem 'guard-bundler'
gem 'guard-rspec'
gem 'rb-fsevent'
gem 'ruby_gntp'

# Debugging.
gem 'debugger', '1.2.0'

# Libraries referenced by the parsers and the spec suite.
gem 'nokogiri'
gem 'activerecord', '~> 3.2.0'
gem 'activesupport', '~> 3.2.0'
gem 'mysql2'
data/Guardfile ADDED
@@ -0,0 +1,11 @@
1
+ guard 'bundler' do
2
+ watch('Gemfile')
3
+ # Uncomment the next line if the Gemfile contains the `gemspec` command
4
+ # watch(/^.+\.gemspec/)
5
+ end
6
+
7
+ guard 'rspec', :version => 2 do
8
+ watch(%r{^spec/.+_spec\.rb$})
9
+ watch(%r{^lib/(.+)\.rb$}) { |m| "spec/lib/#{m[1]}_spec.rb" }
10
+ watch('spec/spec_helper.rb') { "spec" }
11
+ end
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Cory O'Daniel
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,211 @@
1
+ # Ingestor
2
+
3
+ A simple DSL for importing data from text and csv files to ActiveRecord. This was originally designed to
4
+ continually import changing data from EAN and Geonames.
5
+
6
+ ## Installation
7
+
8
+ Add this line to your application's Gemfile:
9
+
10
+ gem 'ingestor'
11
+
12
+ And then execute:
13
+
14
+ $ bundle install
15
+
16
+ Or install it yourself as:
17
+
18
+ $ gem install ingestor
19
+
20
+ Add the following to your Rakefile:
21
+ require 'ingestor/tasks'
22
+
23
+ ## Usage
24
+
25
+ Given a text file:
26
+
27
+ id|name|population
28
+ 1|China|1,354,040,000
29
+ 2|India|1,210,193,422
30
+ 3|United States|315,550,000
31
+
32
+ And an AR Class:
33
+
34
+ class Country < ActiveRecord::Base
35
+ attr_accessible :name, :population
36
+ end
37
+
38
+ Sync the file with AR:
39
+
40
+ ingest("path/to/countries.txt") do
41
+ map_attributes do |values|
42
+ {
43
+ id: values[0],
44
+ name: values[1],
45
+ population: values[2]
46
+ }
47
+ end
48
+
49
+ # attrs holds the current line's values
50
+ finder{|attrs|
51
+ Country.where(id: attrs[:id]).first || Country.new
52
+ }
53
+ end
54
+
55
+ It can handle remote files and zip files as well.
56
+
57
+ ingest("http://example.com/a_lot_of_countries.zip") do
58
+ compressed true
59
+ map_attributes do |values|
60
+ {
61
+ id: values[0],
62
+ name: values[1],
63
+ population: values[2]
64
+ }
65
+ end
66
+
67
+ # attrs holds the current line's values
68
+ finder{|attrs|
69
+ Country.where(id: attrs[:id]).first || Country.new
70
+ }
71
+ end
72
+
73
+ It can handle XML, JSON, and more...
74
+
75
+ ingest("http://example.com/books.xml") do
76
+ parser :xml
77
+ parser_options xpath: '//book'
78
+ map_attributes do |values|
79
+ {
80
+ id: values['id'],
81
+ title: values['title'],
82
+ author: {
83
+ name: values['author']
84
+ }
85
+ }
86
+ end
87
+
88
+ # attrs holds the current entry's values
89
+ finder{|attrs|
90
+ Book.where(id: attrs[:id]).first || Book.new
91
+ }
92
+
93
+ processor{|attrs,record|
94
+ record.update_attributes(attrs)
95
+ record.reviews.create({
96
+ stars: 5,
97
+ comment: "Every book they sell is so great!"
98
+ })
99
+ }
100
+ end
101
+
102
+ ## Advanced Usage
103
+ DSL Options
104
+ * parser - the parser to use on the file
105
+ * Symbol
106
+ * Optional
107
+ * Default: :plain_text
108
+ * Available Values: :plain_text, :xml, :json, :csv, :html
109
+ * See 'Included Parsers' below
110
+ * parser_options - options for a specific parser
111
+ * Hash
112
+ * Optional
113
+ * Default: set per parser
114
+ * See 'Included Parsers' below
115
+ * sample - dump a single raw entry from the file to STDOUT and exit
116
+ * Boolean
117
+ * Optional
118
+ * Default: false
119
+
120
+ * includes_header - Tells the parser that the first line is a header and should be ignored
121
+ * Boolean
122
+ * Optional
123
+ * Default: false
124
+ * compressed - Should the file be decompressed
125
+ * Boolean
126
+ * Optional
127
+ * Default: false
128
+ * working_directory - where to store remote or decompressed files for local processing
129
+ * String
130
+ * Optional
131
+ * Default: /tmp/ingestor
132
+ * before - callback that receives attributes for each record BEFORE call to [finder]
133
+ * Proc(attributes)
134
+ * Optional
135
+ * Default: nil
136
+ * finder - Arel finder for each object
137
+ * Proc(attributes)
138
+ * Returns: ~ActiveModel
139
+ * Required
140
+ * processor - What to do with the attributes and object
141
+ * Proc(attributes,record)
142
+ * Returns: ~ActiveModel
143
+ * Optional
144
+ * Default: Proc, calls #update_attributes on record without protection
145
+ * after - callback that receives each record after [processor]
146
+ * Proc(record)
147
+ * Optional
148
+
149
+
150
+ ## Included Parsers
151
+
152
+ Writing parsers is simple ([see examples](https://github.com/coryodaniel/ingestor/tree/master/lib/ingestor/parser)).
153
+
154
+ ### Plain Text Parser
155
+ Parses a plain text document.
156
+
157
+ Options
158
+ * delimiter - how to split up each line
159
+ * String
160
+ * Default: '|'
161
+ * Optional
162
+ * line\_processor - override default\_line\_processor. The default\_line\_processor simply splits the string using the delimiter
163
+ * Proc(string)
164
+ * Returns Array
165
+ * Default: nil
166
+ * Optional
167
+
168
+ ### XML Parser
169
+ Parses an XML document
170
+
171
+ Options
172
+ * xpath - XPath selector used to get the node collection
173
+ * String
174
+ * Required
175
+ * encoding - XML Encoding. See nokogiri encoding
176
+ * String
177
+ * Optional
178
+ * Default: libxml2 best guess
179
+
180
+ ### JSON Parser
181
+ Coming soon...
182
+
183
+ ### CSV Parser
184
+ Coming soon...
185
+
186
+ ### HTML Parser
187
+ Coming soon...
188
+
189
+
190
+ ## Contributing
191
+
192
+ 1. Fork it
193
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
194
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
195
+ 4. Push to the branch (`git push origin my-new-feature`)
196
+ 5. Create new Pull Request
197
+
198
+ ## Running Tests
199
+
200
+ 1. Copy spec/orm/database.example.yml => spec/orm/database.yml
201
+ 2. Configure spec/orm/database.yml
202
+ 3. bundle exec guard
203
+
204
+
205
+ ## Todos
206
+ * rdoc lib/ folder
207
+ * Move includes_header to CSV, PlainText
208
+ * Mongoid Support
209
+ * sort/limit options
210
+ * A way to sample a file without building an ingestor first
211
+ * bin/ingestor --sample --path=./my.xml --parser xml --parser_options_xpath '//book'
data/Rakefile ADDED
@@ -0,0 +1,7 @@
1
+ require "bundler/gem_tasks"
2
+ require 'rspec/core/rake_task'
3
+
4
+ RSpec::Core::RakeTask.new('spec')
5
+
6
+ # If you want to make this the default task
7
+ task :default => :spec
data/bin/ingest ADDED
@@ -0,0 +1,73 @@
1
#!/usr/bin/env ruby

require 'fileutils'
require 'ingestor'
require 'thor'
require 'active_support/inflector'

module Ingestor
  # Thor command-line interface behind the `ingest` executable.
  class CommandLine < Thor

    # Writes a timestamped ingestor skeleton for +path+ into script/ingestors
    # (relative to the current working directory) and reports the file name.
    #
    # @param path [String] absolute path or URL the generated ingestor will read
    desc "generate PATH", "Create a new ingestor for PATH. PATH should be an absolute path or URL"
    def generate(path)
      output_directory = 'script/ingestors'
      # mkdir_p does not fail when the directory already exists, so no existence
      # check is needed (Dir.exists? is no longer available on Ruby >= 3.2).
      FileUtils.mkdir_p(output_directory, mode: 0755)

      file_name = File.basename(path).underscore.parameterize.underscore
      file_name = %Q{#{Time.now.utc.strftime("%Y%m%d%H%M%S")}_#{file_name}.rb}

      generated_file = File.join(output_directory, file_name)

      File.open(generated_file, 'w') { |f| f.puts ingestor_template(path) }

      say "Generated #{generated_file}"
    end

    private

    # Builds the Ruby source of a new ingestor skeleton.
    #
    # +path+ is embedded with String#inspect so that quotes or interpolation
    # sequences inside it cannot break (or inject code into) the generated file.
    #
    # @param path [String] absolute path or URL to ingest
    # @return [String] source code of the skeleton
    def ingestor_template(path)
      <<-HEREDOC
#! /usr/bin/env ruby
require 'ingestor'

######################################
#
# Order of block execution is:
# * map_attributes
# * before
# * finder
# * processor
# * after
#
######################################

ingest #{path.inspect} do
  parser :plain_text
  # compressed true # is the file compressed
  # includes_header false
  # parser_options delimiter: '|' # parser specific
  # working_directory '/tmp/ingestor' # where to store files that are compressed or remote

  # How to map out the columns from document to ActiveRecord
  map_attributes do |values|
    {
      # ... create your attributes hash here for ActiveRecord/ActiveModel/etc.
      # values may be a Hash (xml, json) or an Array (csv, plain_text)
    }
  end

  # before{ |attrs| attrs}

  # Your strategy for finding or instantiating a new object to be handled by the processor block
  finder do |attrs|
    MyClass.where(id: attrs[:id]).first || MyClass.new
  end

  # The default processor simply calls update_attributes
  # processor do |attrs,record|
  #   ... custom processor here ...
  # end

  # after { |record| record}
end
      HEREDOC
    end
  end
end

Ingestor::CommandLine.start
@@ -0,0 +1,56 @@
1
+ #! /usr/bin/env ruby
2
+ require 'rubygems'
3
+ require 'bundler/setup'
4
+ require 'ingestor'
5
+
6
+ # Set up a bogus active model
7
+ require 'active_model'
8
# Fake ActiveModel-style record used only by this sample script.
class HotelChain
  include ActiveModel::Naming

  attr_accessor :id, :name

  # Always reports the record as persisted.
  def persisted?
    true
  end

  # Assigns every name/value pair through the matching writer method.
  #
  # @param attributes [Hash] attribute names mapped to their new values
  # @return [true]
  def update_attributes(attributes = {})
    attributes.each { |key, val| send("#{key}=", val) }
    true
  end
end
22
+ # end bogusness
23
+
24
# Ingest the remote, zip-compressed, pipe-delimited chain list.
ingest "https://www.ian.com/affiliatecenter/include/V2/ChainList.zip" do
  parser :plain_text
  compressed true
  includes_header true
  # sample true

  parser_options delimiter: '|'

  # Turn one row of column values into an attributes hash.
  map_attributes { |values| { id: values[0], name: values[1] } }

  # before { |attrs| attrs }

  # Supplies the object handed to the processor block. This sample never looks
  # up an existing record; it always starts from a new HotelChain.
  finder do |attrs|
    HotelChain.new
  end

  # Copy the mapped attributes onto the object returned by the finder.
  processor do |attrs, record|
    record.update_attributes attrs
  end

  # Print the name of each processed record.
  after do |record|
    puts "Created: #{record.name}"
  end
end
@@ -0,0 +1,52 @@
1
+ #! /usr/bin/env ruby
2
+ require 'rubygems'
3
+ require 'bundler/setup'
4
+ require 'ingestor'
5
+ require 'ingestor/parser/xml'
6
+
7
+ # Set up a bogus active model
8
+ require 'active_model'
9
# Fake ActiveModel-style record used only by this sample script.
class Book
  include ActiveModel::Naming

  attr_accessor :id, :title, :author, :price, :genre, :publish_date, :description

  # Always reports the record as persisted.
  def persisted?
    true
  end

  # Assigns every name/value pair through the matching writer method.
  #
  # @param attributes [Hash] attribute names mapped to their new values
  # @return [true]
  def update_attributes(attributes = {})
    attributes.each { |key, val| send("#{key}=", val) }
    true
  end
end
23
+ # end bogusness
24
+
25
# Ingest the local sample XML file, one entry per node matched by //book.
ingest "./samples/books.xml" do
  parser :xml
  parser_options xpath: '//book'
  # sample true
  # compressed false

  # Use the value stored under 'book' in each parsed entry as the attributes.
  map_attributes { |values| values['book'] }

  # before { |attrs| attrs }

  # Supplies the object handed to the processor block. This sample never looks
  # up an existing record; it always starts from a new Book.
  finder do |attrs|
    Book.new
  end

  # Copy the mapped attributes onto the object returned by the finder.
  processor do |attrs, record|
    record.update_attributes attrs
  end

  # Print the title of each processed record.
  after do |record|
    puts "Created: #{record.title}"
  end
end