ingestor 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +21 -0
- data/Gemfile +18 -0
- data/Guardfile +11 -0
- data/LICENSE.txt +22 -0
- data/README.md +211 -0
- data/Rakefile +7 -0
- data/bin/ingest +73 -0
- data/examples/text_parsing.rb +56 -0
- data/examples/xml_parsing.rb +52 -0
- data/ingestor.gemspec +23 -0
- data/lib/ingestor.rb +37 -0
- data/lib/ingestor/dsl.rb +110 -0
- data/lib/ingestor/parser/base.rb +28 -0
- data/lib/ingestor/parser/csv.rb +8 -0
- data/lib/ingestor/parser/json.rb +8 -0
- data/lib/ingestor/parser/plain_text.rb +44 -0
- data/lib/ingestor/parser/xml.rb +37 -0
- data/lib/ingestor/proxy.rb +113 -0
- data/lib/ingestor/tasks.rb +15 -0
- data/lib/ingestor/version.rb +3 -0
- data/samples/animals.csv +7 -0
- data/samples/books.xml +32 -0
- data/samples/colors.json +30 -0
- data/samples/flags.txt +12 -0
- data/samples/people.json +26 -0
- data/spec/cassettes/remote-zipped-files.yml +186 -0
- data/spec/lib/ingestor/dsl_spec.rb +114 -0
- data/spec/lib/ingestor/parser/csv_spec.rb +5 -0
- data/spec/lib/ingestor/parser/json_spec.rb +5 -0
- data/spec/lib/ingestor/parser/plain_text_spec.rb +24 -0
- data/spec/lib/ingestor/parser/xml_spec.rb +25 -0
- data/spec/lib/ingestor/proxy_spec.rb +129 -0
- data/spec/lib/ingestor_spec.rb +19 -0
- data/spec/orm/active_record.rb +33 -0
- data/spec/orm/database.example.yml +15 -0
- data/spec/spec_helper.rb +21 -0
- metadata +139 -0
data/ingestor.gemspec
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'ingestor/version'
|
5
|
+
|
6
|
+
# Packaging metadata for the ingestor gem.
Gem::Specification.new do |gem|
  gem.name          = "ingestor"
  gem.version       = Ingestor::VERSION
  gem.authors       = ["Cory O'Daniel"]
  gem.email         = ["github@coryodaniel.com"]
  gem.description   = "Ingesting local and remote data files into ActiveRecord"
  gem.summary       = "Ingesting local and remote data files into ActiveRecord"
  gem.homepage      = "http://github.com/coryodaniel/ingestor"

  # Ship every file tracked by git; executables and test files are derived from that list.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  gem.add_dependency "docile"
  # The library requires 'zip/zipfilesystem' and calls Zip::ZipFile, which is the
  # pre-1.0 rubyzip API (both were renamed in rubyzip 1.0), so stay below 1.0.
  gem.add_dependency "rubyzip", "< 1.0.0"
  gem.add_dependency "thor"
  gem.add_dependency "activesupport", '>= 3.2.0'
  # NOTE(review): lib/ingestor/parser/xml.rb requires 'nokogiri', which is not declared
  # here -- confirm whether it should be a runtime dependency.
end
|
data/lib/ingestor.rb
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
require 'docile'
|
2
|
+
require 'open-uri'
|
3
|
+
require 'csv'
|
4
|
+
require 'logger'
|
5
|
+
require 'zip/zipfilesystem'
|
6
|
+
require 'ingestor/version'
|
7
|
+
require 'ingestor/proxy'
|
8
|
+
require 'ingestor/dsl'
|
9
|
+
|
10
|
+
#require 'debugger'
|
11
|
+
|
12
|
+
# Top-level namespace: holds the shared logger and the registry of parser classes.
module Ingestor
  # Library-wide logger; writes to STDOUT and reports warnings and above.
  LOG = Logger.new(STDOUT)
  LOG.level = Logger::WARN

  class << self
    # Registry of parser classes, keyed by the kind they were registered under.
    def parsers
      @parsers ||= {}
    end

    # Registers +klass+ as the parser for +kind+, replacing any earlier registration.
    def register_parser(kind, klass)
      parsers[kind] = klass
    end

    # Returns the parser class registered for +kind+.
    # Raises ArgumentError (previously a bare Exception) when nothing is registered,
    # so callers rescuing Exception keep working.
    def parser_for(kind)
      klass = parsers[kind]
      raise ArgumentError, "No parser for type #{kind}" if klass.nil?

      klass
    end
  end
end
|
29
|
+
|
30
|
+
# Entry point of the DSL. Evaluates the given block against a new Ingestor::Dsl
# (through Docile.dsl_eval), builds the proxy from the result and starts it.
#
# filename - the file to ingest, stored on the DSL object via #file=
#
# Returns whatever #start! returns.
def ingest(filename, &block)
  dsl = Ingestor::Dsl.new
  dsl.file = filename

  configured = Docile.dsl_eval(dsl, &block)
  configured.build.start!
end
|
35
|
+
|
36
|
+
require 'ingestor/parser/base'
|
37
|
+
require 'ingestor/parser/plain_text'
|
data/lib/ingestor/dsl.rb
ADDED
@@ -0,0 +1,110 @@
|
|
1
|
+
module Ingestor
  # Collects the settings and callbacks for one ingest run. Every DSL method stores
  # its value in the options hash; #build hands the file and that hash to a Proxy.
  class Dsl
    # Raised when a callback is registered without a block or with a block of the
    # wrong arity.
    class InvalidBlockSpecification < StandardError; end

    # Collected settings, keyed by option name.
    attr_reader :options

    # the file to retrieve
    attr_writer :file

    # Starts with the defaults: no header, not compressed, :plain_text parser,
    # empty parser options and '/tmp/ingestor' as the working directory.
    # Arguments are accepted but ignored.
    def initialize(*args)
      @options = {}

      includes_header(false)
      compressed(false)
      parser :plain_text
      parser_options({})
      working_directory '/tmp/ingestor'
    end

    # When set to true sample will get the file and print out the first
    # set of raw values
    def sample(v)
      @options[:sample] = v
    end

    # Where the file will be moved locally for processing when it is compressed or a remote file.
    # Local files will not use the working directory.
    def working_directory(v)
      @options[:working_directory] = v
    end

    # set parser, default :plain_text
    def parser(v)
      @options[:parser] = v
    end

    # set the options passed on to the parser
    def parser_options(v)
      @options[:parser_options] = v
    end

    # skip first line?
    def includes_header(v)
      @options[:includes_header] = v
    end

    # if the remote file is compressed, this will decompress it.
    def compressed(v)
      @options[:compressed] = v
    end

    # Takes an array of values (a line/entry/node) and should return an
    # ActiveModel type object
    #
    # You do not need to set the attributes here, that is handled by #processor
    #
    # update or create:
    #   finder{|attrs| User.where(id: attrs[:id]).first || User.new}
    #
    # create:
    #   finder{|attrs| User.new}
    # @required
    def finder(&block)
      store_callback(:finder, block, 1,
                     "finder proc should have an arity of 1 (Array: values)")
    end

    # How to process an entry in a file. The default takes the values and passes them to the record returned
    # by your finder and calls update attributes
    # Proc should receive two parameters
    #   attrs - Hash, mapped attributes for this record
    #   record - ~ActiveRecord:Base, record found by #finder
    def processor(&block)
      store_callback(:processor, block, 2,
                     "processor proc should have an arity of 2 (Array: values, ~ActiveRecord: record)")
    end

    # Processing performed on the attributes before being passed to [+finder+]
    def before(&block)
      store_callback(:before, block, 1,
                     "before proc should have an arity of 1 (Array: values)")
    end

    # Processing performed on the record AFTER being passed to [+processor+]
    def after(&block)
      store_callback(:after, block, 1,
                     "after proc should have an arity of 1 (~ActiveRecord: record)")
    end

    # This method is called for each entry in the document
    # Block should receive 'values' (array for plain text, hash for all others) and return a hash
    # of ActiveModel attribute name to value
    def map_attributes(&block)
      # The message used to say "after proc" (copy-paste slip); it now names this callback.
      store_callback(:map_attributes, block, 1,
                     "map_attributes proc should have an arity of 1 (Hash|Array: values)")
    end

    # Creates the Proxy for the configured file and options.
    def build
      Ingestor::Proxy.new(@file, @options)
    end

    private

    # Validates that +callback+ is present with exactly +arity+ parameters and stores
    # it under +name+; raises InvalidBlockSpecification with +message+ otherwise.
    def store_callback(name, callback, arity, message)
      if callback.nil? || callback.arity != arity
        raise InvalidBlockSpecification, message
      end

      @options[name] = callback
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module Ingestor
  module Parser
    # Mixin shared by parser classes. It stores the proxy and the document the
    # parser works on and declares the two operations a parser must implement:
    # #sample! and #process!.
    module Base
      # Extends the including class with ClassMethods.
      def self.included(base)
        base.extend(ClassMethods)
      end

      # Class-level extension point; currently empty.
      module ClassMethods; end

      # proxy    - object the parser reports entries to
      # document - the document to parse
      def initialize(proxy, document)
        @proxy    = proxy
        @document = document
      end

      # Stores the parser options as given.
      def options(opts)
        @options = opts
      end

      # Must be overridden by the including class.
      # Raises NotImplementedError (previously a bare Exception) otherwise.
      def sample!
        raise NotImplementedError, "#sample! not implemented"
      end

      # Must be overridden by the including class.
      # Raises NotImplementedError (previously a bare Exception) otherwise.
      def process!
        raise NotImplementedError, "#process! not implemented"
      end
    end
  end
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
module Ingestor
  module Parser
    # Parser for line-oriented text. Each line is turned into values (by default by
    # splitting on the delimiter), mapped to attributes through the proxy's
    # :map_attributes callback and handed to the proxy as one entry.
    class PlainText
      include Ingestor::Parser::Base

      # Merges +opts+ over the defaults:
      #   delimiter      - separator used by the default line processor ('|')
      #   line_processor - optional callable that replaces the default splitting
      def options(opts={})
        @options = {
          delimiter: '|',
          line_processor: nil
        }.merge(opts)
      end

      # Reads the document line by line until it is exhausted and passes each
      # mapped entry to the proxy.
      def process!
        while (line = @document.gets)
          line.chomp!
          attrs = @proxy.options[:map_attributes].call( process_line(line) )
          @proxy.process_entry attrs
        end
      end

      # Prints the next raw line of the document. Does nothing when the document
      # has no line left (previously this raised NoMethodError on nil).
      def sample!
        line = @document.gets
        return if line.nil?

        puts line.chomp
      end

      protected

      # Runs the default line processor or line processor provided to options
      def process_line(line)
        if @options[:line_processor]
          @options[:line_processor].call(line)
        else
          default_line_processor(line)
        end
      end

      # Splits the line on the configured delimiter.
      def default_line_processor(line)
        line.split(@options[:delimiter])
      end
    end
  end
end
|
43
|
+
|
44
|
+
# Make this parser available under the :plain_text key of the parser registry.
Ingestor.register_parser :plain_text, Ingestor::Parser::PlainText
|
@@ -0,0 +1,37 @@
|
|
1
|
+
# #require 'open-uri'
|
2
|
+
# #http://nokogiri.org/tutorials/parsing_an_html_xml_document.html
|
3
|
+
# #doc = Nokogiri::HTML(open("http://www.threescompany.com/"))
|
4
|
+
|
5
|
+
require 'nokogiri'
|
6
|
+
require 'active_support/core_ext/hash/conversions'
|
7
|
+
|
8
|
+
module Ingestor
  module Parser
    # XML parser. Selects nodes with the configured xpath, converts each node to a
    # hash with Hash.from_xml, maps it through the proxy's :map_attributes callback
    # and hands the result to the proxy.
    class Xml
      include Ingestor::Parser::Base

      # Merges +opts+ over the defaults (encoding: nil, xpath: nil).
      def options(opts={})
        defaults = { encoding: nil, xpath: nil }
        @options = defaults.merge(opts)
      end

      # Prints the hash form of the first node matched by the xpath.
      def sample!
        first_node = matching_nodes.first
        puts Hash.from_xml( first_node.to_s )
      end

      # Sends every node matched by the xpath to the proxy as one entry.
      def process!
        matching_nodes.each do |node|
          entry = Hash.from_xml(node.to_s)
          mapped = @proxy.options[:map_attributes].call( entry )
          @proxy.process_entry mapped
        end
      end

      protected

      # Parses the document with the configured encoding and returns the nodes
      # selected by the configured xpath.
      def matching_nodes
        parsed = Nokogiri::XML(@document, nil, @options[:encoding])
        parsed.xpath(@options[:xpath])
      end
    end
  end
end
|
36
|
+
|
37
|
+
# Make this parser available under the :xml key of the parser registry.
Ingestor.register_parser :xml, Ingestor::Parser::Xml
|
@@ -0,0 +1,113 @@
|
|
1
|
+
require 'open-uri'
require 'tempfile'

module Ingestor
  # Drives one ingest run: loads the file (downloading and/or unzipping it when
  # needed), hands the document to the configured parser and applies the
  # finder/processor/before/after callbacks to every entry the parser reports.
  #
  # file    - path or URL of the file to ingest
  # options - Hash of settings and callbacks collected by the DSL
  Proxy = Struct.new(:file, :options) do
    # document - the IO being parsed (set by #load)
    # header   - first line of the document when :includes_header is set
    attr_reader :document, :header

    # Truthy when +file+ starts with an http://, https:// or ftp:// scheme.
    # The pattern is anchored so local paths that merely contain "http" or "ftp"
    # (e.g. "dumps/ftp_export.txt") are no longer treated as remote.
    def remote?
      file =~ %r{\A(?:https?|ftp)://}i
    end

    def local?
      !remote?
    end

    def working_directory
      options[:working_directory]
    end

    def compressed?
      options[:compressed]
    end

    # for debugging, testing
    # Rewinds the document and returns an array holding line +line_num+ (1-based).
    def continue_from(line_num)
      @document.rewind
      @document.drop( line_num - 1 ).take(1)
    end

    def finder
      options[:finder]
    end

    # Loads the document, reads the header line when :includes_header is set and
    # runs the configured parser: #sample! when :sample is set, #process! otherwise.
    # Returns self.
    def start!
      load
      Ingestor::LOG.warn("No #finder specified") if !finder

      if options[:includes_header]
        first_line = @document.gets
        # gets returns nil on an empty document; keep the header nil in that case
        @header = first_line && first_line.strip
      end

      parser = Ingestor.parser_for( options[:parser] ).new(self, @document)
      parser.options( options[:parser_options] )

      if options[:sample]
        parser.sample!
      else
        parser.process!
      end

      self
    end

    # To be called from Parsers, send attributes, get a record.
    # Runs :before, looks the record up with the finder, and when the record's class
    # has an ancestor whose name contains "ActiveModel" processes it, runs :after
    # and returns it. Otherwise logs a warning and skips the entry.
    def process_entry( attrs )
      options[:before].call(attrs) if options[:before]

      record = finder ? finder.call(attrs) : nil

      if record && record.class.ancestors.any? { |ancestor| ancestor.to_s =~ /ActiveModel/ }
        process_record(attrs, record)
        options[:after].call(record) if options[:after]
        record
      else
        Ingestor::LOG.warn("Processing skipped, ActiveModel type record not returned for #{attrs}")
      end
    end

    # Uses the :processor callback when given, the default processor otherwise.
    def process_record(attrs,record)
      options[:processor] ? options[:processor].call(attrs, record) : default_processor(attrs, record)
    end

    # Calls update_attributes on the record with without_protection: true.
    def default_processor(attrs,record)
      record.update_attributes( attrs, without_protection: true )
    end

    # Downloads the remote file into a Tempfile inside the working directory.
    def load_remote
      Ingestor::LOG.debug("Remote file detected #{file}...")
      @document = Tempfile.new("local", working_directory)
      @document.binmode if compressed?

      # Open through the parsed URI (open-uri) rather than Kernel#open, which no
      # longer accepts URLs on Ruby 3 and would run "|cmd" strings as commands.
      URI.parse(file).open('rb') do |remote_file|
        Ingestor::LOG.debug("Downloading #{file}...")
        @document.write remote_file.read
        @document.rewind
      end
    end

    # When loading compressed files the assumption is that if there is more than one
    # that the files are chunked, they will be put together and treated as one large file
    def load_compressed
      Ingestor::LOG.debug("Compressed file detected #{file}...")
      @tempfile = @document
      # A local archive was never downloaded, so there is no tempfile to unzip;
      # read the archive from its own path instead of failing on nil.
      archive_path = @tempfile ? @tempfile.path : file

      @document = Tempfile.new("decompressed", working_directory)
      @document.binmode

      Zip::ZipFile.open(archive_path) do |zipfile|
        zipfile.each do |entry|
          istream = entry.get_input_stream
          @document.write istream.read
        end
      end
      @document.rewind
    end

    # Prepares @document: creates the working directory when missing, downloads
    # remote files, unzips compressed ones and otherwise opens the local file.
    def load
      # File.directory? instead of Dir.exists?, which was removed in Ruby 3.2
      Dir.mkdir(working_directory, 0777) unless File.directory?(working_directory)

      load_remote if remote?
      load_compressed if compressed?

      @document ||= File.new( file )
    end
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
# require this file to load the tasks
|
2
|
+
require 'rake'
|
3
|
+
|
4
|
+
# noop
|
5
|
+
=begin
|
6
|
+
This is here as a starting point for adding rake tasks that can be 'required' by another project
|
7
|
+
Just add: require 'ingestor/tasks' to your Rakefile
|
8
|
+
=end
|
9
|
+
|
10
|
+
# Rake tasks shipped with the gem, grouped under the ingestor namespace.
namespace(:ingestor) do
  # Placeholder task that only prints a message.
  desc("Example task")
  task(:example) { puts("I'm a task") }
end
|
data/samples/animals.csv
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
common name, domesticated, family, subfamily, genus
|
2
|
+
chicken, true, Phasianidae, Phasianidae, Gallus
|
3
|
+
brown bear, false, Ursidae,,Ursus
|
4
|
+
cow, true, Bovidae, Bovinae, Bos
|
5
|
+
dog, true, Canidae,,Canis
|
6
|
+
cat,true, Felidae,,Felis
|
7
|
+
platypus,false,Ornithorhynchidae,,Ornithorhynchus
|