ingestor 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
data/ingestor.gemspec ADDED
@@ -0,0 +1,23 @@
1
# -*- encoding: utf-8 -*-
# Gem specification for ingestor. The gem's lib directory is put on the load
# path first so that the version constant can be required below.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'ingestor/version'

Gem::Specification.new do |spec|
  # Identity
  spec.name = "ingestor"
  spec.version = Ingestor::VERSION
  spec.authors = ["Cory O'Daniel"]
  spec.email = ["github@coryodaniel.com"]
  spec.description = "Ingesting local and remote data files into ActiveRecord"
  spec.summary = "Ingesting local and remote data files into ActiveRecord"
  spec.homepage = "http://github.com/coryodaniel/ingestor"

  # Packaged files: everything tracked by git; executables and test files are
  # derived from that list by path prefix.
  spec.files = `git ls-files`.split($/)
  spec.executables = spec.files.grep(%r{^bin/}).map { |path| File.basename(path) }
  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  # Runtime dependencies (name => version requirements, empty means any).
  {
    "docile" => [],
    "rubyzip" => [],
    "thor" => [],
    "activesupport" => ['>= 3.2.0']
  }.each { |name, requirements| spec.add_dependency(name, *requirements) }
end
data/lib/ingestor.rb ADDED
@@ -0,0 +1,37 @@
1
+ require 'docile'
2
+ require 'open-uri'
3
+ require 'csv'
4
+ require 'logger'
5
+ require 'zip/zipfilesystem'
6
+ require 'ingestor/version'
7
+ require 'ingestor/proxy'
8
+ require 'ingestor/dsl'
9
+
10
+ #require 'debugger'
11
+
12
# Top-level namespace for the gem. Holds the shared logger and a registry that
# maps a parser kind (e.g. :plain_text) to the class implementing it.
module Ingestor
  # Logger shared by the library; writes to STDOUT and only reports warnings
  # and above unless the level is changed.
  LOG = Logger.new(STDOUT)
  LOG.level = Logger::WARN

  class << self
    # @return [Hash] registered parsers, keyed by kind
    def parsers
      @parsers ||= {}
    end

    # Registers (or replaces) the parser class used for +kind+.
    #
    # @param kind [Object] lookup key, a Symbol in this library
    # @param klass [Class] parser class
    # @return [Class] the class that was registered
    def register_parser(kind, klass)
      parsers[kind] = klass
    end

    # Looks up the parser class registered for +kind+.
    #
    # @param kind [Object] lookup key
    # @return [Class] the registered parser class
    # @raise [ArgumentError] when nothing is registered for +kind+.
    #   ArgumentError is a StandardError, so a plain +rescue+ can handle it;
    #   the previous bare Exception could only be caught by +rescue Exception+.
    def parser_for(kind)
      klass = parsers[kind]
      raise ArgumentError, "No parser for type #{kind}" if klass.nil?
      klass
    end
  end
end
29
+
30
# Entry point of the DSL: configures an ingestion of +filename+ using the
# given block, builds the proxy from that configuration and starts it.
#
# @param filename [String] the file to ingest
# @return whatever +start!+ returns on the built object
def ingest(filename, &block)
  dsl = Ingestor::Dsl.new
  dsl.file = filename
  configured = Docile.dsl_eval(dsl, &block)
  configured.build.start!
end
35
+
36
+ require 'ingestor/parser/base'
37
+ require 'ingestor/parser/plain_text'
@@ -0,0 +1,110 @@
1
module Ingestor
  # Collects the configuration for one ingestion run. Each DSL method stores
  # its argument in the options hash; #build hands the file and the options
  # to Ingestor::Proxy.
  class Dsl
    # Raised when a callback is given without a block or with a block of the
    # wrong arity. Inherits from StandardError so a plain +rescue+ catches it.
    class InvalidBlockSpecification < StandardError; end

    # @return [Hash] the options collected so far
    attr_reader :options

    # The file to retrieve.
    attr_writer :file

    # Sets the defaults: no header line, not compressed, the :plain_text
    # parser with empty parser options, and '/tmp/ingestor' as the working
    # directory. Any arguments are accepted and ignored.
    def initialize(*args)
      @options = {}

      includes_header(false)
      compressed(false)
      parser :plain_text
      parser_options({})
      working_directory '/tmp/ingestor'
    end

    # When set to true, sample will get the file and print out the first
    # set of raw values instead of processing it.
    def sample(v)
      @options[:sample] = v
    end

    # Where the file will be moved locally for processing when it is
    # compressed or a remote file. Local files will not use the working
    # directory.
    def working_directory(v)
      @options[:working_directory] = v
    end

    # Set the parser, default :plain_text.
    def parser(v)
      @options[:parser] = v
    end

    # Set the options passed on to the parser.
    def parser_options(v)
      @options[:parser_options] = v
    end

    # Skip the first line?
    def includes_header(v)
      @options[:includes_header] = v
    end

    # If the remote file is compressed, this will decompress it.
    def compressed(v)
      @options[:compressed] = v
    end

    # Takes an array of values (a line/entry/node) and should return an
    # ActiveModel type object.
    #
    # You do not need to set the attributes here, that is handled by
    # #processor.
    #
    # update or create:
    #   finder{|attrs| User.where(id: attrs[:id]).first || User.new}
    #
    # create:
    #   finder{|attrs| User.new}
    # @required
    # @raise [InvalidBlockSpecification] unless given a block of arity 1
    def finder(&block)
      store_callback(:finder, block, 1, "Array: values")
    end

    # How to process an entry in a file. The default takes the values,
    # passes them to the record returned by your finder and calls update
    # attributes.
    # Proc should receive two parameters:
    #   attrs  - Hash, mapped attributes for this record
    #   record - ~ActiveRecord::Base, record found by #finder
    # @raise [InvalidBlockSpecification] unless given a block of arity 2
    def processor(&block)
      store_callback(:processor, block, 2, "Array: values, ~ActiveRecord: record")
    end

    # Processing performed on the attributes before being passed to
    # [+finder+].
    # @raise [InvalidBlockSpecification] unless given a block of arity 1
    def before(&block)
      store_callback(:before, block, 1, "Array: values")
    end

    # Processing performed on the record AFTER being passed to
    # [+processor+].
    # @raise [InvalidBlockSpecification] unless given a block of arity 1
    def after(&block)
      store_callback(:after, block, 1, "~ActiveRecord: record")
    end

    # This method is called for each entry in the document.
    # Block should receive 'values' (array for plain text, hash for all
    # others) and return a hash of ActiveModel attribute name to value.
    # @raise [InvalidBlockSpecification] unless given a block of arity 1
    def map_attributes(&block)
      store_callback(:map_attributes, block, 1, "Hash|Array: values")
    end

    # Builds the proxy that performs the ingestion from the configured file
    # and options.
    def build
      Ingestor::Proxy.new(@file, @options)
    end

    private

    # Validates that +block+ exists and has exactly +arity+ parameters, then
    # stores it under +name+ in the options hash and returns it.
    def store_callback(name, block, arity, signature)
      if block.nil? || block.arity != arity
        raise InvalidBlockSpecification,
              "#{name} proc should have an arity of #{arity} (#{signature})"
      end
      @options[name] = block
    end
  end
end
@@ -0,0 +1,28 @@
1
module Ingestor
  module Parser
    # Mixin shared by the parsers. Provides the constructor and the methods a
    # concrete parser is expected to override (#sample! and #process!).
    module Base
      # Extends the including class with ClassMethods.
      def self.included(base)
        base.extend(ClassMethods)
      end

      # Class-level extension point; currently empty.
      module ClassMethods; end

      # @param proxy the object driving the ingestion
      # @param document the document to parse
      def initialize(proxy, document)
        @proxy = proxy
        @document = document
      end

      # Stores the parser options as given.
      def options(opts)
        @options = opts
      end

      # Must be overridden by the including parser.
      # @raise [NotImplementedError] always
      def sample!
        raise NotImplementedError, "#sample! not implemented"
      end

      # Must be overridden by the including parser.
      # @raise [NotImplementedError] always
      def process!
        raise NotImplementedError, "#process! not implemented"
      end
    end
  end
end
@@ -0,0 +1,8 @@
1
module Ingestor
  module Parser
    # Placeholder for a CSV parser. The class has no behavior yet: it does
    # not include Parser::Base and defines no methods of its own.
    class Csv; end
  end
end

# Make the placeholder available under the :csv kind.
Ingestor.register_parser(:csv, Ingestor::Parser::Csv)
@@ -0,0 +1,8 @@
1
module Ingestor
  module Parser
    # Placeholder for a JSON parser. The class has no behavior yet: it does
    # not include Parser::Base and defines no methods of its own.
    class Json; end
  end
end

# Make the placeholder available under the :json kind.
Ingestor.register_parser(:json, Ingestor::Parser::Json)
@@ -0,0 +1,44 @@
1
module Ingestor
  module Parser
    # Line-oriented parser: every line of the document is one entry, split
    # into values by a delimiter or by a caller-supplied line processor.
    class PlainText
      include Ingestor::Parser::Base

      # Merges the given options over the defaults.
      #
      # @param opts [Hash, nil] :delimiter (default '|') and :line_processor
      #   (default nil, a callable receiving the raw line); nil is treated
      #   as an empty hash
      # @return [Hash] the effective options
      def options(opts={})
        @options = {
          delimiter: '|',
          line_processor: nil
        }.merge(opts || {})
      end

      # Reads the document line by line; each line is stripped of its line
      # ending, turned into values, mapped to attributes by the proxy's
      # :map_attributes callback and handed to the proxy for processing.
      def process!
        while (line = @document.gets)
          line.chomp!
          attrs = @proxy.options[:map_attributes].call(process_line(line))
          @proxy.process_entry attrs
        end
      end

      # Prints the next raw line of the document. Prints nothing when the
      # document has no line left (gets returns nil at end of input).
      def sample!
        line = @document.gets
        return if line.nil?

        puts line.chomp
      end

      protected

      # Runs the default line processor or the line processor provided to
      # options.
      def process_line(line)
        custom = @options[:line_processor]
        custom ? custom.call(line) : default_line_processor(line)
      end

      # Splits the line on the configured delimiter.
      # NOTE: String#split drops trailing empty fields, so "a|b||" yields
      # two values, not four.
      def default_line_processor(line)
        line.split(@options[:delimiter])
      end
    end
  end
end

Ingestor.register_parser :plain_text, Ingestor::Parser::PlainText
@@ -0,0 +1,37 @@
1
+ # #require 'open-uri'
2
+ # #http://nokogiri.org/tutorials/parsing_an_html_xml_document.html
3
+ # #doc = Nokogiri::HTML(open("http://www.threescompany.com/"))
4
+
5
+ require 'nokogiri'
6
+ require 'active_support/core_ext/hash/conversions'
7
+
8
module Ingestor
  module Parser
    # XML parser: the document is parsed with Nokogiri, the nodes selected
    # by the :xpath option are the entries, and each node is converted to a
    # Hash with Hash.from_xml.
    class Xml
      include Ingestor::Parser::Base

      # Merges the given options over the defaults (:encoding and :xpath,
      # both nil).
      def options(opts={})
        defaults = { encoding: nil, xpath: nil }
        @options = defaults.merge(opts)
      end

      # Prints the Hash form of the first node matched by the xpath.
      def sample!
        xml = Nokogiri::XML(@document, nil, @options[:encoding])
        first_match = xml.xpath(@options[:xpath]).first
        puts Hash.from_xml(first_match.to_s)
      end

      # Converts every matched node to a Hash, maps it to attributes with
      # the proxy's :map_attributes callback and hands it to the proxy.
      def process!
        xml = Nokogiri::XML(@document, nil, @options[:encoding])

        xml.xpath(@options[:xpath]).each do |element|
          element_hash = Hash.from_xml(element.to_s)
          mapped = @proxy.options[:map_attributes].call(element_hash)
          @proxy.process_entry(mapped)
        end
      end
    end
  end
end

Ingestor.register_parser(:xml, Ingestor::Parser::Xml)
@@ -0,0 +1,113 @@
1
require 'open-uri'
require 'tempfile'

module Ingestor
  # Drives one ingestion: loads the file (downloading and/or decompressing
  # it into the working directory when needed), instantiates the configured
  # parser and runs every entry through the before/finder/processor/after
  # callbacks found in +options+.
  Proxy = Struct.new(:file, :options) do
    # The opened document (nil until #start! / #load has run).
    def document
      @document
    end

    # The first line of the document when :includes_header is set, else nil.
    def header
      @header
    end

    # True when +file+ starts with an http://, https:// or ftp:// scheme.
    # The match is anchored so that local paths which merely contain "http"
    # or "ftp" (e.g. "/data/http_logs.txt") are not sent to the downloader.
    def remote?
      !(file.to_s =~ %r{\A(?:https?|ftp)://}i).nil?
    end

    def local?
      !remote?
    end

    def working_directory
      options[:working_directory]
    end

    def compressed?; options[:compressed]; end

    # For debugging and testing: rewinds the document and returns an array
    # holding line number +line_num+ (1-based).
    def continue_from(line_num)
      @document.rewind
      @document.drop(line_num - 1).take(1)
    end

    def finder
      options[:finder]
    end

    # Loads the document and runs the configured parser over it, or only
    # samples it when the :sample option is set.
    #
    # @return [Ingestor::Proxy] self
    def start!
      load
      Ingestor::LOG.warn("No #finder specified") if !finder

      if options[:includes_header]
        # gets returns nil for an empty document; leave the header nil then.
        first_line = @document.gets
        @header = first_line && first_line.strip
      end

      parser = Ingestor.parser_for(options[:parser]).new(self, @document)
      parser.options(options[:parser_options])

      if options[:sample]
        parser.sample!
      else
        parser.process!
      end

      self
    end

    # To be called from parsers: send attributes, get a record.
    # Runs the :before callback, asks the finder for a record and, when the
    # record's class has an ancestor whose name contains "ActiveModel",
    # processes it and runs the :after callback. Otherwise the entry is
    # skipped with a warning.
    def process_entry(attrs)
      options[:before].call(attrs) if options[:before]

      record = finder ? finder.call(attrs) : nil

      if record && record.class.ancestors.any? { |ancestor| ancestor.to_s =~ /ActiveModel/ }
        process_record(attrs, record)
        options[:after].call(record) if options[:after]
        record
      else
        Ingestor::LOG.warn("Processing skipped, ActiveModel type record not returned for #{attrs}")
      end
    end

    # Uses the :processor callback when given, the default processor
    # otherwise.
    def process_record(attrs, record)
      options[:processor] ? options[:processor].call(attrs, record) : default_processor(attrs, record)
    end

    def default_processor(attrs, record)
      record.update_attributes(attrs, without_protection: true)
    end

    # Downloads +file+ into a tempfile inside the working directory and
    # leaves it rewound in @document.
    def load_remote
      Ingestor::LOG.debug("Remote file detected #{file}...")
      @document = Tempfile.new("local", working_directory)
      @document.binmode if compressed?

      # Kernel#open stopped handling URLs in Ruby 3.0; URI.open is the
      # open-uri entry point where it exists, Kernel.open on older Rubies.
      opener = URI.respond_to?(:open) ? URI : Kernel
      opener.open(file, 'rb') do |remote_file|
        Ingestor::LOG.debug("Downloading #{file}...")
        @document.write remote_file.read
        @document.rewind
      end
    end

    # When loading compressed files the assumption is that if there is more
    # than one entry the files are chunked; they are put together and
    # treated as one large file.
    def load_compressed
      Ingestor::LOG.debug("Compressed file detected #{file}...")
      @tempfile = @document
      # A remote archive has been downloaded to @tempfile by #load_remote; a
      # local archive has not been opened yet, so read it from its own path.
      archive_path = @tempfile ? @tempfile.path : file
      @document = Tempfile.new("decompressed", working_directory)
      @document.binmode

      Zip::ZipFile.open(archive_path) do |zipfile|
        zipfile.each do |entry|
          @document.write entry.get_input_stream.read
        end
      end
      @document.rewind
    end

    # Makes sure the working directory exists, then fetches and/or
    # decompresses the file as required. A plain local file is opened
    # directly.
    def load
      Dir.mkdir(working_directory, 0777) unless Dir.exist?(working_directory)

      load_remote if remote?
      load_compressed if compressed?

      @document ||= File.new(file)
    end
  end
end
@@ -0,0 +1,15 @@
1
+ # require this file to load the tasks
2
+ require 'rake'
3
+
4
+ # noop
5
+ =begin
6
+ This is here as a start point for adding rake tasks that can be 'required' by another project
7
+ Just add: require 'ingestor/tasks' to your Rakefile
8
+ =end
9
+
10
# Rake tasks exposed by the gem, grouped under the `ingestor` namespace.
namespace(:ingestor) do
  desc("Example task")
  # Placeholder task: only prints a message.
  task(:example) { puts "I'm a task" }
end
@@ -0,0 +1,3 @@
1
module Ingestor
  # Version of the gem; read by the gemspec.
  VERSION = "0.1.1"
end
@@ -0,0 +1,7 @@
1
+ common name, domesticated, family, subfamily, genus
2
+ chicken, true, Phasianidae, Phasianidae, Gallus
3
+ brown bear, false, Ursidae,,Ursus
4
+ cow, true, Bovidae, Bovinae, Bos
5
+ dog, true, Canidae,,Canis
6
+ cat,true, Felidae,,Felis
7
+ platypus,false,Ornithorhynchidae,,Ornithorhynchus