ingestor 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/ingestor.gemspec ADDED
@@ -0,0 +1,23 @@
1
+ # -*- encoding: utf-8 -*-
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'ingestor/version'
5
+
6
# Gem specification for ingestor: metadata, packaged files and runtime dependencies.
Gem::Specification.new do |gem|
  gem.name          = "ingestor"
  gem.version       = Ingestor::VERSION
  gem.authors       = ["Cory O'Daniel"]
  gem.email         = ["github@coryodaniel.com"]
  gem.description   = "Ingesting local and remote data files into ActiveRecord"
  gem.summary       = "Ingesting local and remote data files into ActiveRecord"
  gem.homepage      = "http://github.com/coryodaniel/ingestor"

  # Package whatever git tracks; executables and test files are derived from that list.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  gem.add_dependency "docile"
  # The library requires 'zip/zipfilesystem' and uses Zip::ZipFile. Both belong
  # to the pre-1.0 rubyzip interface (1.0 renamed them), so newer releases must
  # be excluded or the gem fails at require time.
  gem.add_dependency "rubyzip", "< 1.0"
  gem.add_dependency "thor"
  gem.add_dependency "activesupport", ">= 3.2.0"
end
data/lib/ingestor.rb ADDED
@@ -0,0 +1,37 @@
1
+ require 'docile'
2
+ require 'open-uri'
3
+ require 'csv'
4
+ require 'logger'
5
+ require 'zip/zipfilesystem'
6
+ require 'ingestor/version'
7
+ require 'ingestor/proxy'
8
+ require 'ingestor/dsl'
9
+
10
+ #require 'debugger'
11
+
12
# Top-level namespace. Holds the shared logger and the registry that maps a
# parser kind to the class implementing it.
module Ingestor
  # Library-wide logger: writes to STDOUT and reports warnings and above.
  LOG = Logger.new(STDOUT)
  LOG.level = Logger::WARN

  class << self
    # @return [Hash] registered parsers, keyed by kind
    def parsers
      @parsers ||= {}
    end

    # Registers +klass+ as the parser for +kind+, replacing any previous entry.
    #
    # @param kind [Object] lookup key, e.g. :plain_text
    # @param klass [Class] parser class
    # @return [Class] the registered class
    def register_parser(kind, klass)
      parsers[kind] = klass
    end

    # Looks up the parser class registered for +kind+.
    #
    # @param kind [Object] lookup key used at registration
    # @return [Class] the registered parser class
    # @raise [ArgumentError] when nothing is registered for +kind+
    def parser_for(kind)
      klass = parsers[kind]
      # ArgumentError (a StandardError) so that a plain `rescue` can handle it;
      # a bare Exception would slip past ordinary error handling.
      raise ArgumentError, "No parser for type #{kind}" if klass.nil?

      klass
    end
  end
end
29
+
30
# Entry point of the DSL: configures an ingest run for +filename+ using the
# given block, then builds the proxy and starts it.
#
# @param filename [String] file handed to the Dsl as the file to retrieve
# @yield block evaluated against an Ingestor::Dsl instance via Docile
# @return whatever +start!+ returns on the built proxy
def ingest(filename, &block)
  dsl = Ingestor::Dsl.new
  dsl.file = filename
  Docile.dsl_eval(dsl, &block).build.start!
end
35
+
36
+ require 'ingestor/parser/base'
37
+ require 'ingestor/parser/plain_text'
@@ -0,0 +1,110 @@
1
module Ingestor
  # Collects the settings and callbacks for one ingest run and turns them into
  # an Ingestor::Proxy via #build.
  class Dsl
    # Raised when a callback block is missing or declares the wrong number of
    # parameters.
    class InvalidBlockSpecification < StandardError; end

    # @return [Hash] every setting collected so far
    attr_reader :options

    # The file to retrieve.
    attr_writer :file

    # Seeds the option hash with the defaults: no header, not compressed,
    # :plain_text parser with empty parser options, '/tmp/ingestor' as the
    # working directory. Arguments are accepted but ignored.
    def initialize(*args)
      @options = {}

      includes_header(false)
      compressed(false)
      parser :plain_text
      parser_options({})
      working_directory '/tmp/ingestor'
    end

    # When set to true the run only prints a sample of the raw values instead
    # of processing the file.
    def sample(v)
      @options[:sample] = v
    end

    # Where the file is placed locally for processing when it is compressed or
    # remote. Plain local files do not use the working directory.
    def working_directory(v)
      @options[:working_directory] = v
    end

    # Selects the parser; the default is :plain_text.
    def parser(v)
      @options[:parser] = v
    end

    # Options handed to the selected parser.
    def parser_options(v)
      @options[:parser_options] = v
    end

    # Whether the first line of the document is a header to be skipped.
    def includes_header(v)
      @options[:includes_header] = v
    end

    # Whether the file is compressed and has to be decompressed first.
    def compressed(v)
      @options[:compressed] = v
    end

    # Takes the values of a line/entry/node and should return an ActiveModel
    # type object. The attributes do not need to be set here; that is handled
    # by #processor.
    #
    # update or create:
    #   finder { |attrs| User.where(id: attrs[:id]).first || User.new }
    #
    # create:
    #   finder { |attrs| User.new }
    #
    # @raise [InvalidBlockSpecification] unless a block of arity 1 is given
    def finder(&block)
      store_callback(:finder, 1, "Array: values", block)
    end

    # How to process an entry in a file. When not set, the proxy passes the
    # attributes to the record returned by the finder and updates it.
    # The block receives two parameters:
    #   attrs  - Hash, mapped attributes for this record
    #   record - ~ActiveRecord::Base, record found by #finder
    #
    # @raise [InvalidBlockSpecification] unless a block of arity 2 is given
    def processor(&block)
      store_callback(:processor, 2, "Array: values, ~ActiveRecord: record", block)
    end

    # Processing performed on the attributes before they are passed to #finder.
    #
    # @raise [InvalidBlockSpecification] unless a block of arity 1 is given
    def before(&block)
      store_callback(:before, 1, "Array: values", block)
    end

    # Processing performed on the record after it has been passed to #processor.
    #
    # @raise [InvalidBlockSpecification] unless a block of arity 1 is given
    def after(&block)
      store_callback(:after, 1, "~ActiveRecord: record", block)
    end

    # Called for each entry in the document. The block receives 'values'
    # (array for plain text, hash for all others) and returns a hash of
    # ActiveModel attribute name to value.
    #
    # @raise [InvalidBlockSpecification] unless a block of arity 1 is given
    def map_attributes(&block)
      store_callback(:map_attributes, 1, "Hash|Array: values", block)
    end

    # @return [Ingestor::Proxy] proxy for the configured file and options
    def build
      Ingestor::Proxy.new(@file, @options)
    end

    private

    # Validates that +block+ exists and has exactly +arity+ parameters, then
    # stores it under +name+. Kept private so it is not part of the DSL surface.
    def store_callback(name, arity, signature, block)
      if block.nil? || block.arity != arity
        raise InvalidBlockSpecification, "#{name} proc should have an arity of #{arity} (#{signature})"
      end
      @options[name] = block
    end
  end
end
@@ -0,0 +1,28 @@
1
module Ingestor
  module Parser
    # Mixin shared by parser classes. Provides the (proxy, document)
    # constructor and an options setter, and declares the two operations a
    # concrete parser has to implement: #sample! and #process!.
    module Base
      # Extends the including class with ClassMethods.
      def self.included(base)
        base.extend(ClassMethods)
      end

      # Class-level extension point; currently empty.
      module ClassMethods; end

      # @param proxy    object stored for the parser to use while processing
      # @param document object stored as the document to parse
      def initialize(proxy, document)
        @proxy = proxy
        @document = document
      end

      # Stores parser-specific options.
      #
      # @param opts [Hash]
      # @return [Hash] the stored options
      def options(opts)
        @options = opts
      end

      # Prints a sample of the document. Must be overridden.
      #
      # @raise [NotImplementedError] always, in this default implementation
      def sample!
        raise NotImplementedError, "#sample! not implemented"
      end

      # Processes every entry in the document. Must be overridden.
      #
      # @raise [NotImplementedError] always, in this default implementation
      def process!
        raise NotImplementedError, "#process! not implemented"
      end
    end
  end
end
@@ -0,0 +1,8 @@
1
module Ingestor
  module Parser
    # Placeholder for a CSV parser; no parsing logic is implemented yet.
    #
    # Base is mixed in so the class honours the interface the proxy relies on
    # (a (proxy, document) constructor, #options, #process!, #sample!). The
    # inherited #process!/#sample! report that they are not implemented,
    # instead of failing earlier with an argument-count error.
    class Csv
      include Ingestor::Parser::Base
    end
  end
end

Ingestor.register_parser :csv, Ingestor::Parser::Csv
@@ -0,0 +1,8 @@
1
module Ingestor
  module Parser
    # Placeholder for a JSON parser; no parsing logic is implemented yet.
    #
    # Base is mixed in so the class honours the interface the proxy relies on
    # (a (proxy, document) constructor, #options, #process!, #sample!). The
    # inherited #process!/#sample! report that they are not implemented,
    # instead of failing earlier with an argument-count error.
    class Json
      include Ingestor::Parser::Base
    end
  end
end

Ingestor.register_parser :json, Ingestor::Parser::Json
@@ -0,0 +1,44 @@
1
module Ingestor
  module Parser
    # Parser for line-oriented text documents. Each line is turned into values
    # (by default by splitting on a delimiter), mapped to attributes through
    # the proxy's :map_attributes callback, and handed to the proxy.
    class PlainText
      include Ingestor::Parser::Base

      # Stores the parser options on top of the defaults.
      #
      # @param opts [Hash, nil] overrides; recognised keys:
      #   :delimiter      - separator used by the default line processor ('|')
      #   :line_processor - callable receiving a line, replaces the default split
      # @return [Hash] the effective options
      def options(opts = {})
        @options = {
          delimiter: '|',
          line_processor: nil
        }.merge(opts || {}) # tolerate an explicit nil
      end

      # Reads the document line by line until it is exhausted and passes the
      # mapped attributes of every line to the proxy.
      def process!
        while (line = @document.gets)
          line.chomp!
          attrs = @proxy.options[:map_attributes].call(process_line(line))
          @proxy.process_entry attrs
        end
      end

      # Prints the next raw line of the document, without its line ending.
      # Prints nothing when the document has no more lines.
      def sample!
        line = @document.gets
        return if line.nil? # empty or already exhausted document

        puts line.chomp
      end

      protected

      # Runs the line processor provided through the options, or the default
      # one when none was given.
      def process_line(line)
        if @options[:line_processor]
          @options[:line_processor].call(line)
        else
          default_line_processor(line)
        end
      end

      # Splits the line on the configured delimiter.
      def default_line_processor(line)
        line.split(@options[:delimiter])
      end
    end
  end
end

Ingestor.register_parser :plain_text, Ingestor::Parser::PlainText
@@ -0,0 +1,37 @@
1
+ # #require 'open-uri'
2
+ # #http://nokogiri.org/tutorials/parsing_an_html_xml_document.html
3
+ # #doc = Nokogiri::HTML(open("http://www.threescompany.com/"))
4
+
5
+ require 'nokogiri'
6
+ require 'active_support/core_ext/hash/conversions'
7
+
8
module Ingestor
  module Parser
    # Parser for XML documents. Nodes are selected with the :xpath option,
    # converted to a Hash with Hash.from_xml and mapped through the proxy's
    # :map_attributes callback.
    class Xml
      include Ingestor::Parser::Base

      # Stores the parser options on top of the defaults
      # (:encoding and :xpath both default to nil).
      def options(opts={})
        defaults = { encoding: nil, xpath: nil }
        @options = defaults.merge(opts)
      end

      # Prints the Hash form of the first node selected by :xpath.
      def sample!
        first_node = parsed_document.xpath(@options[:xpath]).first
        puts Hash.from_xml(first_node.to_s)
      end

      # Hands the mapped attributes of every node selected by :xpath to the proxy.
      def process!
        parsed_document.xpath(@options[:xpath]).each do |node|
          as_hash = Hash.from_xml(node.to_s)
          mapped  = @proxy.options[:map_attributes].call(as_hash)
          @proxy.process_entry mapped
        end
      end

      private

      # Parses the document with Nokogiri using the configured encoding.
      def parsed_document
        Nokogiri::XML(@document, nil, @options[:encoding])
      end
    end
  end
end

Ingestor.register_parser :xml, Ingestor::Parser::Xml
@@ -0,0 +1,113 @@
1
module Ingestor
  # Drives one ingest run: obtains the document (downloading and/or
  # decompressing it when needed), instantiates the configured parser and
  # turns each parsed entry into a record through the configured callbacks.
  #
  # file    - path or URL of the document
  # options - Hash of settings and callbacks (see the keys read below)
  Proxy = Struct.new(:file, :options) do
    # document - IO currently being processed (set by #load)
    # header   - stripped first line, when options[:includes_header] is set
    attr_reader :document, :header

    # True when +file+ starts with an http, https or ftp URL scheme.
    # Anchored on the scheme so local paths that merely contain "http" or
    # "ftp" (e.g. "/data/ftp_export.txt") are not mistaken for URLs.
    def remote?
      !!(file.to_s =~ %r{\A(?:https?|ftp)://}i)
    end

    def local?
      !remote?
    end

    def working_directory
      options[:working_directory]
    end

    def compressed?
      options[:compressed]
    end

    # For debugging/testing: rewinds the document and returns a one-element
    # array holding line number +line_num+ (1-based).
    def continue_from(line_num)
      @document.rewind
      @document.drop(line_num - 1).take(1)
    end

    def finder
      options[:finder]
    end

    # Loads the document, consumes the header line when configured, then runs
    # the parser in sample or process mode.
    #
    # @return [Proxy] self
    def start!
      load
      Ingestor::LOG.warn("No #finder specified") unless finder

      if options[:includes_header]
        first_line = @document.gets
        # gets returns nil on an empty document; leave the header unset then
        @header = first_line.strip if first_line
      end

      parser = Ingestor.parser_for(options[:parser]).new(self, @document)
      parser.options(options[:parser_options])

      if options[:sample]
        parser.sample!
      else
        parser.process!
      end

      self
    end

    # To be called from parsers: send attributes, get a record.
    # Runs the :before callback, obtains a record from the finder and, when
    # one of its class's ancestors is named like ActiveModel, processes it and
    # runs the :after callback. Otherwise a warning is logged and the entry is
    # skipped.
    #
    # @param attrs attributes of the entry
    # @return the processed record, or the logger's return value when skipped
    def process_entry(attrs)
      options[:before].call(attrs) if options[:before]

      record = finder ? finder.call(attrs) : nil

      if record && record.class.ancestors.any? { |ancestor| ancestor.to_s =~ /ActiveModel/ }
        process_record(attrs, record)
        options[:after].call(record) if options[:after]
        record
      else
        Ingestor::LOG.warn("Processing skipped, ActiveModel type record not returned for #{attrs}")
      end
    end

    # Applies the :processor callback when given, the default processor otherwise.
    def process_record(attrs, record)
      options[:processor] ? options[:processor].call(attrs, record) : default_processor(attrs, record)
    end

    # Default processing: update the record with the attributes.
    def default_processor(attrs, record)
      record.update_attributes(attrs, without_protection: true)
    end

    # Downloads +file+ into a Tempfile inside the working directory and makes
    # that Tempfile the current document.
    def load_remote
      Ingestor::LOG.debug("Remote file detected #{file}...")
      @document = Tempfile.new("local", working_directory)
      @document.binmode if compressed?

      # Kernel#open no longer opens URLs on Ruby 3.0+; open-uri exposes
      # URI.open on newer rubies. Fall back to Kernel.open where URI.open
      # does not exist.
      opener = URI.respond_to?(:open) ? URI : Kernel
      opener.open(file, 'rb') do |remote_file|
        Ingestor::LOG.debug("Downloading #{file}...")
        @document.write remote_file.read
        @document.rewind
      end
    end

    # When loading compressed files the assumption is that if there is more
    # than one entry, the files are chunked: they are concatenated and treated
    # as one large file.
    def load_compressed
      Ingestor::LOG.debug("Compressed file detected #{file}...")
      @tempfile = @document
      # A local archive has not been copied to a Tempfile, so read it from
      # its own path instead of dereferencing a nil document.
      archive_path = @tempfile ? @tempfile.path : file
      @document = Tempfile.new("decompressed", working_directory)
      @document.binmode

      Zip::ZipFile.open(archive_path) do |zipfile|
        zipfile.each do |entry|
          @document.write entry.get_input_stream.read
        end
      end
      @document.rewind
    end

    # Ensures the working directory exists, then fetches/decompresses the file
    # as required. A plain local file is simply opened.
    #
    # @return [IO] the document
    def load
      # Dir.exist? rather than the alias Dir.exists?, which was removed in Ruby 3.2
      Dir.mkdir(working_directory, 0777) unless Dir.exist?(working_directory)

      load_remote if remote?
      load_compressed if compressed?

      @document ||= File.new(file)
    end
  end
end
@@ -0,0 +1,15 @@
1
+ # require this file to load the tasks
2
+ require 'rake'
3
+
4
+ # noop
5
+ =begin
6
+ This is here as a start point for adding rake tasks that can be 'required' by another project
7
+ Just add: require 'ingestor/tasks' to your Rakefile
8
+ =end
9
+
10
# Rake tasks grouped under the +ingestor+ namespace.
namespace(:ingestor) do
  desc("Example task")
  # Placeholder task: prints a fixed message.
  task(:example) { puts "I'm a task" }
end
@@ -0,0 +1,3 @@
1
module Ingestor
  # Current gem version. Frozen so the shared constant cannot be mutated in place.
  VERSION = "0.1.1".freeze
end
@@ -0,0 +1,7 @@
1
+ common name, domesticated, family, subfamily, genus
2
+ chicken, true, Phasianidae, Phasianidae, Gallus
3
+ brown bear, false, Ursidae,,Ursus
4
+ cow, true, Bovidae, Bovinae, Bos
5
+ dog, true, Canidae,,Canis
6
+ cat,true, Felidae,,Felis
7
+ platypus,false,Ornithorhynchidae,,Ornithorhynchus