ingestor 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +21 -0
- data/Gemfile +18 -0
- data/Guardfile +11 -0
- data/LICENSE.txt +22 -0
- data/README.md +211 -0
- data/Rakefile +7 -0
- data/bin/ingest +73 -0
- data/examples/text_parsing.rb +56 -0
- data/examples/xml_parsing.rb +52 -0
- data/ingestor.gemspec +23 -0
- data/lib/ingestor.rb +37 -0
- data/lib/ingestor/dsl.rb +110 -0
- data/lib/ingestor/parser/base.rb +28 -0
- data/lib/ingestor/parser/csv.rb +8 -0
- data/lib/ingestor/parser/json.rb +8 -0
- data/lib/ingestor/parser/plain_text.rb +44 -0
- data/lib/ingestor/parser/xml.rb +37 -0
- data/lib/ingestor/proxy.rb +113 -0
- data/lib/ingestor/tasks.rb +15 -0
- data/lib/ingestor/version.rb +3 -0
- data/samples/animals.csv +7 -0
- data/samples/books.xml +32 -0
- data/samples/colors.json +30 -0
- data/samples/flags.txt +12 -0
- data/samples/people.json +26 -0
- data/spec/cassettes/remote-zipped-files.yml +186 -0
- data/spec/lib/ingestor/dsl_spec.rb +114 -0
- data/spec/lib/ingestor/parser/csv_spec.rb +5 -0
- data/spec/lib/ingestor/parser/json_spec.rb +5 -0
- data/spec/lib/ingestor/parser/plain_text_spec.rb +24 -0
- data/spec/lib/ingestor/parser/xml_spec.rb +25 -0
- data/spec/lib/ingestor/proxy_spec.rb +129 -0
- data/spec/lib/ingestor_spec.rb +19 -0
- data/spec/orm/active_record.rb +33 -0
- data/spec/orm/database.example.yml +15 -0
- data/spec/spec_helper.rb +21 -0
- metadata +139 -0
data/ingestor.gemspec
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'ingestor/version'
|
5
|
+
|
6
|
+
# Packaging metadata for the ingestor gem.
Gem::Specification.new do |gem|
  gem.name          = "ingestor"
  gem.version       = Ingestor::VERSION
  gem.authors       = ["Cory O'Daniel"]
  gem.email         = ["github@coryodaniel.com"]
  gem.description   = "Ingesting local and remote data files into ActiveRecord"
  gem.summary       = "Ingesting local and remote data files into ActiveRecord"
  gem.homepage      = "http://github.com/coryodaniel/ingestor"

  # The packaged file list is whatever git tracks; executables and test
  # files are derived from it by path prefix.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  gem.add_dependency "docile"
  # The library requires 'zip/zipfilesystem' and calls Zip::ZipFile. rubyzip
  # 1.0 renamed both (zip/filesystem, Zip::File), so an unpinned dependency
  # resolves to a version this code cannot load.
  gem.add_dependency "rubyzip", "< 1.0.0"
  gem.add_dependency "thor"
  gem.add_dependency "activesupport", '>= 3.2.0'
end
|
data/lib/ingestor.rb
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
require 'docile'
|
2
|
+
require 'open-uri'
|
3
|
+
require 'csv'
|
4
|
+
require 'logger'
|
5
|
+
require 'zip/zipfilesystem'
|
6
|
+
require 'ingestor/version'
|
7
|
+
require 'ingestor/proxy'
|
8
|
+
require 'ingestor/dsl'
|
9
|
+
|
10
|
+
#require 'debugger'
|
11
|
+
|
12
|
+
# Top-level namespace. Holds the shared logger and the registry that maps a
# parser identifier (for example :plain_text) to the class implementing it.
module Ingestor
  # Library-wide logger; writes to STDOUT and only reports warnings and above.
  LOG = Logger.new(STDOUT)
  LOG.level = Logger::WARN

  class << self
    # @return [Hash] registered parser classes keyed by identifier
    def parsers
      @parsers ||= {}
    end

    # Registers +klass+ as the parser for +kind+, replacing any previous entry.
    #
    # @param kind  [Object] identifier later passed to #parser_for
    # @param klass [Class]  parser class
    # @return [Class] the registered class
    def register_parser(kind, klass)
      parsers[kind] = klass
    end

    # Looks up the parser registered for +kind+.
    #
    # @param kind [Object] identifier used at registration time
    # @return [Class] the registered parser class
    # @raise [ArgumentError] when nothing is registered for +kind+. This used
    #   to be a bare Exception, which a plain `rescue` cannot catch;
    #   ArgumentError is still caught by `rescue Exception`.
    def parser_for(kind)
      klass = parsers[kind]
      raise ArgumentError, "No parser for type #{kind}" if klass.nil?

      klass
    end
  end
end
|
29
|
+
|
30
|
+
# Entry point of the ingestion DSL.
#
# Builds an Ingestor::Dsl for +filename+, evaluates +block+ against it via
# Docile, then builds the resulting proxy and starts it.
#
# @param filename [String] file handed to the DSL as the file to ingest
# @yield configuration block evaluated against the Ingestor::Dsl instance
# @return whatever #start! returns on the object produced by Dsl#build
def ingest(filename, &block)
  options = Ingestor::Dsl.new
  options.file = filename
  Docile.dsl_eval(options, &block).build.start!
end
|
35
|
+
|
36
|
+
require 'ingestor/parser/base'
|
37
|
+
require 'ingestor/parser/plain_text'
|
data/lib/ingestor/dsl.rb
ADDED
@@ -0,0 +1,110 @@
|
|
1
|
+
module Ingestor
  # Collects the settings for one ingestion run. Every DSL method stores its
  # argument (or block) in the options hash; #build hands the file and the
  # accumulated options to Ingestor::Proxy.
  class Dsl
    # Raised when a callback block is missing or declares the wrong number of
    # parameters. Inherits from StandardError so a plain `rescue` catches it.
    class InvalidBlockSpecification < StandardError; end

    # Seeds the options hash with the defaults. Arguments are accepted but
    # ignored.
    def initialize(*args)
      @options = {}

      includes_header(false)
      compressed(false)
      parser :plain_text
      parser_options({})
      working_directory '/tmp/ingestor'
    end

    # @return [Hash] every option collected so far
    attr_reader :options

    # The file to retrieve.
    attr_writer :file

    # When set to true, the first set of raw values is printed instead of
    # processing the file.
    def sample(v)
      @options[:sample] = v
    end

    # Where the file is placed locally for processing when it is compressed
    # or remote. Local files do not use the working directory.
    def working_directory(v)
      @options[:working_directory] = v
    end

    # Sets the parser identifier; the default is :plain_text.
    def parser(v)
      @options[:parser] = v
    end

    # Sets the options hash handed to the parser.
    def parser_options(v)
      @options[:parser_options] = v
    end

    # Skip the first line?
    def includes_header(v)
      @options[:includes_header] = v
    end

    # If the remote file is compressed, this will decompress it.
    def compressed(v)
      @options[:compressed] = v
    end

    # Takes an array of values (a line/entry/node) and should return an
    # ActiveModel type object.
    #
    # You do not need to set the attributes here; that is handled by
    # #processor.
    #
    # update or create:
    #   finder{|attrs| User.where(id: attrs[:id]).first || User.new}
    #
    # create:
    #   finder{|attrs| User.new}
    # @required
    # @raise [InvalidBlockSpecification] unless a one-parameter block is given
    def finder(&block)
      ensure_block_arity!(block, 1, "finder proc should have an arity of 1 (Array: values)")
      @options[:finder] = block
    end

    # How to process an entry in a file. The default takes the values, passes
    # them to the record returned by your finder and calls update attributes.
    # The block receives two parameters:
    #   attrs  - Hash, mapped attributes for this record
    #   record - ~ActiveRecord::Base, record found by #finder
    # @raise [InvalidBlockSpecification] unless a two-parameter block is given
    def processor(&block)
      ensure_block_arity!(block, 2, "processor proc should have an arity of 2 (Array: values, ~ActiveRecord: record)")
      @options[:processor] = block
    end

    # Processing performed on the attributes before they are passed to
    # [+finder+].
    # @raise [InvalidBlockSpecification] unless a one-parameter block is given
    def before(&block)
      ensure_block_arity!(block, 1, "before proc should have an arity of 1 (Array: values)")
      @options[:before] = block
    end

    # Processing performed on the record AFTER it has been passed to
    # [+processor+].
    # @raise [InvalidBlockSpecification] unless a one-parameter block is given
    def after(&block)
      ensure_block_arity!(block, 1, "after proc should have an arity of 1 (~ActiveRecord: record)")
      @options[:after] = block
    end

    # Called for each entry in the document. The block receives 'values'
    # (array for plain text, hash for all others) and returns a hash of
    # ActiveModel attribute name to value.
    # @raise [InvalidBlockSpecification] unless a one-parameter block is given
    def map_attributes(&block)
      # The message previously said "after proc" (copied from #after).
      ensure_block_arity!(block, 1, "map_attributes proc should have an arity of 1 (Hash|Array: values)")
      @options[:map_attributes] = block
    end

    # @return [Ingestor::Proxy] proxy built from the file and collected options
    def build
      Ingestor::Proxy.new(@file, @options)
    end

    private

    # Raises InvalidBlockSpecification with +message+ unless +block+ is
    # present and declares exactly +arity+ parameters.
    def ensure_block_arity!(block, arity, message)
      return if block && block.arity == arity

      raise InvalidBlockSpecification, message
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module Ingestor
  module Parser
    # Mixin shared by the parser classes. It stores the proxy and document
    # handed to the constructor and declares the two operations every parser
    # must supply: #sample! and #process!.
    module Base
      # Extends the including class with ClassMethods.
      def self.included(base)
        base.extend(ClassMethods)
      end

      # Placeholder for class-level helpers; currently empty.
      module ClassMethods; end

      # @param proxy    the object driving the parse (stored as @proxy)
      # @param document the input to read from (stored as @document)
      def initialize(proxy, document)
        @proxy = proxy
        @document = document
      end

      # Stores the parser options as given.
      #
      # @param opts [Hash] parser options
      # @return [Hash] the stored options
      def options(opts)
        @options = opts
      end

      # Abstract: must be overridden by the including class.
      #
      # @raise [NotImplementedError] always. This used to be a bare Exception;
      #   NotImplementedError is Ruby's standard signal for an unimplemented
      #   method and is still caught by `rescue Exception`.
      def sample!
        raise NotImplementedError, "#sample! not implemented"
      end

      # Abstract: must be overridden by the including class.
      #
      # @raise [NotImplementedError] always
      def process!
        raise NotImplementedError, "#process! not implemented"
      end
    end
  end
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
module Ingestor
  module Parser
    # Line-oriented parser. Each line of the document is turned into values
    # (split on a delimiter by default), mapped to attributes by the proxy's
    # :map_attributes callback, and handed to the proxy for processing.
    class PlainText
      include Ingestor::Parser::Base

      # Merges +opts+ over the defaults.
      #
      # @param opts [Hash] :delimiter (default '|') and :line_processor
      #   (default nil, meaning "split on the delimiter")
      # @return [Hash] the effective options
      def options(opts={})
        @options = {
          delimiter: '|',
          line_processor: nil
        }.merge(opts)
      end

      # Reads the document line by line until #gets returns nil, and passes
      # each line's mapped attributes to the proxy's #process_entry.
      def process!
        while (line = @document.gets)
          line.chomp!
          attrs = @proxy.options[:map_attributes].call( process_line(line) )
          @proxy.process_entry attrs
        end
      end

      # Prints the next raw line of the document without its line terminator.
      # Prints nothing when the document has no more lines; previously this
      # called chomp! on nil and raised NoMethodError.
      def sample!
        line = @document.gets
        return if line.nil?

        puts line.chomp
      end

      protected

      # Runs the line processor provided in the options, or the default one.
      def process_line(line)
        custom = @options[:line_processor]
        custom ? custom.call(line) : default_line_processor(line)
      end

      # Splits the line on the configured delimiter.
      def default_line_processor(line)
        line.split(@options[:delimiter])
      end
    end
  end
end
|
43
|
+
|
44
|
+
Ingestor.register_parser :plain_text, Ingestor::Parser::PlainText
|
@@ -0,0 +1,37 @@
|
|
1
|
+
# #require 'open-uri'
|
2
|
+
# #http://nokogiri.org/tutorials/parsing_an_html_xml_document.html
|
3
|
+
# #doc = Nokogiri::HTML(open("http://www.threescompany.com/"))
|
4
|
+
|
5
|
+
require 'nokogiri'
|
6
|
+
require 'active_support/core_ext/hash/conversions'
|
7
|
+
|
8
|
+
module Ingestor
  module Parser
    # XML parser. Nodes are selected with the configured XPath expression,
    # converted to hashes with Hash.from_xml, mapped to attributes by the
    # proxy's :map_attributes callback, and handed to the proxy.
    class Xml
      include Ingestor::Parser::Base

      # Merges +opts+ over the defaults (:encoding and :xpath, both nil).
      def options(opts={})
        defaults = { encoding: nil, xpath: nil }
        @options = defaults.merge(opts)
      end

      # Prints the hash form of the first node matched by the XPath.
      def sample!
        xml = Nokogiri::XML(@document, nil, @options[:encoding])
        first_match = xml.xpath(@options[:xpath]).first
        puts Hash.from_xml( first_match.to_s )
      end

      # Sends the mapped attributes of every matched node to the proxy.
      def process!
        xml = Nokogiri::XML(@document, nil, @options[:encoding])
        matches = xml.xpath(@options[:xpath])

        matches.each do |element|
          raw_values = Hash.from_xml(element.to_s)
          @proxy.process_entry(@proxy.options[:map_attributes].call( raw_values ))
        end
      end
    end
  end
end
|
36
|
+
|
37
|
+
Ingestor.register_parser :xml, Ingestor::Parser::Xml
|
@@ -0,0 +1,113 @@
|
|
1
|
+
module Ingestor
  # Drives one ingestion run: fetches and/or decompresses the file into a
  # local document, instantiates the configured parser, and applies the
  # finder/processor/before/after callbacks to every entry the parser reports.
  #
  # file    - path or URL of the file to ingest
  # options - Hash of settings, as collected by Ingestor::Dsl
  Proxy = Struct.new(:file, :options) do
    # The IO-like object being parsed; set by #load.
    def document
      @document
    end

    # First line of the document, stripped; only set by #start! when
    # options[:includes_header] is truthy and the document is not empty.
    def header
      @header
    end

    # Truthy when +file+ is an http://, https:// or ftp:// URL. The pattern is
    # anchored and requires "://" so that local paths that merely contain
    # "http" or "ftp" (e.g. "/var/ftp_dumps/a.txt") are not treated as remote.
    def remote?
      file =~ %r{\A(?:https?|ftp)://}i
    end

    def local?
      !remote?
    end

    def working_directory
      options[:working_directory]
    end

    def compressed?
      options[:compressed]
    end

    # For debugging and testing: rewinds the document and returns a
    # one-element array holding line number +line_num+ (1-based).
    def continue_from(line_num)
      @document.rewind
      @document.drop(line_num - 1).take(1)
    end

    def finder
      options[:finder]
    end

    # Loads the document, optionally consumes the header line, then either
    # processes the whole document or, when options[:sample] is truthy, asks
    # the parser for a sample.
    #
    # @return [Ingestor::Proxy] self
    def start!
      load
      Ingestor::LOG.warn("No #finder specified") if !finder

      if options[:includes_header]
        first_line = @document.gets
        # An empty document has no header line; leave @header nil instead of
        # calling strip on nil.
        @header = first_line.strip if first_line
      end

      parser = Ingestor.parser_for( options[:parser] ).new(self, @document)
      parser.options( options[:parser_options] )

      if options[:sample]
        parser.sample!
      else
        parser.process!
      end

      self
    end

    # To be called from parsers: send attributes, get a record.
    #
    # Runs the :before callback, obtains a record from the finder, and, when
    # the record's class has an ancestor whose name matches /ActiveModel/,
    # runs the processor and the :after callback and returns the record.
    # Otherwise a warning is logged and the entry is skipped.
    def process_entry( attrs )
      options[:before].call(attrs) if options[:before]

      record = finder ? finder.call(attrs) : nil

      if record && record.class.ancestors.any? { |r| r.to_s =~ /ActiveModel/ }
        process_record(attrs, record)
        options[:after].call(record) if options[:after]
        record
      else
        Ingestor::LOG.warn("Processing skipped, ActiveModel type record not returned for #{attrs}")
      end
    end

    # Applies the custom :processor callback when given, else the default.
    def process_record(attrs, record)
      options[:processor] ? options[:processor].call(attrs, record) : default_processor(attrs, record)
    end

    # Default processor: calls update_attributes on the record with
    # without_protection: true.
    def default_processor(attrs, record)
      record.update_attributes( attrs, without_protection: true )
    end

    # Downloads +file+ into a Tempfile inside the working directory and makes
    # that Tempfile the document.
    def load_remote
      Ingestor::LOG.debug("Remote file detected #{file}...")
      @document = Tempfile.new("local", working_directory)
      @document.binmode if compressed?

      open( file, 'rb' ) do |remote_file|
        Ingestor::LOG.debug("Downloading #{file}...")
        @document.write remote_file.read
        @document.rewind
      end
    end

    # When loading compressed files the assumption is that if there is more
    # than one entry, the files are chunked; they are concatenated and treated
    # as one large file.
    def load_compressed
      Ingestor::LOG.debug("Compressed file detected #{file}...")
      # A remote archive has already been downloaded into @document by
      # #load_remote. For a local archive nothing has been opened yet, so read
      # it straight from +file+ (previously this called #path on nil).
      @tempfile = @document
      archive_path = @tempfile ? @tempfile.path : file

      @document = Tempfile.new("decompressed", working_directory)
      @document.binmode

      Zip::ZipFile.open(archive_path) do |zipfile|
        zipfile.each do |entry|
          @document.write entry.get_input_stream.read
        end
      end
      @document.rewind
    end

    # Ensures the working directory exists, then prepares the document:
    # download if remote, decompress if compressed, otherwise open the local
    # file directly.
    def load
      # Dir.exists? is deprecated and was removed in Ruby 3.2.
      Dir.mkdir(working_directory, 0777) unless Dir.exist?(working_directory)

      load_remote if remote?
      load_compressed if compressed?

      @document ||= File.new( file )
    end
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
# require this file to load the tasks
|
2
|
+
require 'rake'
|
3
|
+
|
4
|
+
# noop
|
5
|
+
=begin
|
6
|
+
This is here as a start point for adding rake tasks that can be 'required' by another project
|
7
|
+
Just add: require 'ingestor/tasks' to your Rakefile
|
8
|
+
=end
|
9
|
+
|
10
|
+
# Rake tasks exposed under the ingestor: namespace.
namespace :ingestor do
  desc "Example task"
  # Placeholder task; prints a fixed message.
  task(:example) { puts "I'm a task" }
end
|
data/samples/animals.csv
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
common name, domesticated, family, subfamily, genus
|
2
|
+
chicken, true, Phasianidae, Phasianidae, Gallus
|
3
|
+
brown bear, false, Ursidae,,Ursus
|
4
|
+
cow, true, Bovidae, Bovinae, Bos
|
5
|
+
dog, true, Canidae,,Canis
|
6
|
+
cat,true, Felidae,,Felis
|
7
|
+
platypus,false,Ornithorhynchidae,,Ornithorhynchus
|