remote_table-ruby19 0.2.30

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
@@ -0,0 +1,5 @@
1
+ *.sw?
2
+ .DS_Store
3
+ coverage
4
+ rdoc
5
+ pkg
@@ -0,0 +1,3 @@
1
+ 0.1.6
2
+ * For CSVs, force convert headers using String#toutf8. :encoding => 'N'|'U' didn't work.
3
+ * Fix handling of long urls when passing off to Tempfile.
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Brighter Planet
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,27 @@
1
+ =remote_table
2
+
3
+ Remotely open and parse Excel XLS, ODS, CSV and fixed-width tables.
4
+
5
+ ==Real-life usage
6
+
7
+ Used by data_miner (http://github.com/seamusabshere/data_miner)
8
+
9
+ ==Example
10
+
11
+ Taken from <tt>#{GEMDIR}/test/remote_table_test.rb</tt>:
12
+
13
+ >> t = RemoteTable.new(:url => 'http://www.fueleconomy.gov/FEG/epadata/98guide6.zip', :filename => '98guide6.csv')
14
+ => #<RemoteTable:0x359da50 @transform=#<RemoteTable::Transform:0x359d154 @select=nil, @reject=nil>, @file=#<RemoteTable::File:0x35970c4 @delimiter=nil, @headers=nil, @cut=nil, @filename="98guide6.csv", @skip=nil, @schema_name=nil, @crop=nil, @format=:csv, @trap=nil, @sheet=0, @schema=nil>, @package=#<RemoteTable::Package:0x359c538 @url="http://www.fueleconomy.gov/FEG/epadata/98guide6.zip", @filename="98guide6.csv", @compression=:zip, @packing=nil>, @request=#<RemoteTable::Request:0x3596bec @url="http://www.fueleconomy.gov/FEG/epadata/98guide6.zip">>
15
+ >> t.rows.first
16
+ => {"cyl"=>"6", "eng dscr"=>"DOHC VTEC", "trans dscr"=>"2MODE CLKUP", "trans"=>"Auto(L4)", "cmb"=>"20", "2pv"=>nil, "carline name"=>"NSX", "displ"=>"3.0", "ucmb"=>"23.5311", "hpv"=>nil, "4pv"=>nil, "Class"=>"TWO SEATERS", "Manufacturer"=>"ACURA", "fl"=>"P", "2lv"=>nil, "G"=>nil, "hlv"=>nil, "drv"=>"R", "cty"=>"18", "ucty"=>"19.8733", "S"=>nil, "4lv"=>nil, "fcost"=>"1050", "T"=>nil, "hwy"=>"24", "uhwy"=>"30.3612"}
17
+
18
+ See the test file, and also data_miner, for examples of custom parsers.
19
+
20
+ ==Authors
21
+
22
+ * Seamus Abshere <seamus@abshere.net>
23
+ * Andy Rossmeissl <andy@rossmeissl.net>
24
+
25
+ == Copyright
26
+
27
+ Copyright (c) 2010 Brighter Planet. See LICENSE for details.
@@ -0,0 +1,70 @@
1
# Rakefile for the remote_table gem: gem packaging (Jeweler), unit tests,
# coverage (RCov) and RDoc generation.
require 'rubygems'
require 'rake'

# Packaging/release tasks. They are skipped, with a hint, when Jeweler is missing.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "remote_table"
    gem.summary = %Q{Remotely open and parse XLS, ODS, CSV and fixed-width tables.}
    gem.description = %Q{Remotely open and parse Excel XLS, ODS, CSV and fixed-width tables.}
    gem.email = "seamus@abshere.net"
    gem.homepage = "http://github.com/seamusabshere/remote_table"
    gem.authors = ["Seamus Abshere", "Andy Rossmeissl"]
    gem.add_dependency 'roo', '1.3.11' # 1.9.3 breaks
    gem.add_dependency 'fastercsv', '>=1.5.0'
    gem.add_dependency 'activesupport', '>=2.3.4'
    gem.add_dependency 'slither', '>=0.99.3'
    gem.add_dependency 'nokogiri', '>=1.4.1'
    gem.add_dependency 'escape', '>=0.0.4'
    gem.add_development_dependency 'errata', '>=0.2.0'
    gem.require_path = "lib"
    gem.rdoc_options << '--line-numbers' << '--inline-source'
    gem.requirements << 'curl'
    gem.rubyforge_project = "remotetable"
  end
  Jeweler::GemcutterTasks.new
  Jeweler::RubyforgeTasks.new do |rubyforge|
    rubyforge.doc_task = "rdoc"
  end
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
end

# `rake test` runs every test/**/*_test.rb with lib/ and test/ on the load path.
require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/*_test.rb'
  test.verbose = true
end

# `rake rcov` runs the same tests under RCov, or aborts with install
# instructions when RCov is not installed.
begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |test|
    test.libs << 'test'
    test.pattern = 'test/**/*_test.rb'
    test.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end

task :default => :test

# `rake rdoc`. Prefer the task class shipped with RDoc; fall back to the legacy
# one bundled with old Rake releases so the Rakefile loads on either.
begin
  require 'rdoc/task'
  rdoc_task_class = RDoc::Task
rescue LoadError
  require 'rake/rdoctask'
  rdoc_task_class = Rake::RDocTask
end

rdoc_task_class.new do |rdoc|
  # Strip the trailing newline of the VERSION file so it does not leak into the title.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "remote_table #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.2.30
@@ -0,0 +1,117 @@
1
+ require 'digest/md5'
2
+ require 'uri'
3
+ require 'tmpdir'
4
+ require 'active_support'
5
+ require 'active_support/version'
6
+ %w{
7
+ active_support/core_ext/string/conversions
8
+ active_support/core_ext/object/blank
9
+ active_support/core_ext/string/inflections
10
+ active_support/core_ext/array/wrap
11
+ active_support/core_ext/hash/except
12
+ active_support/core_ext/class/attribute_accessors
13
+ }.each do |active_support_3_requirement|
14
+ require active_support_3_requirement
15
+ end if ActiveSupport::VERSION::MAJOR == 3
16
+
17
+ require 'escape'
18
+ require 'slither'
19
+ require 'roo'
20
+ I_KNOW_I_AM_USING_AN_OLD_AND_BUGGY_VERSION_OF_LIBXML2 = true
21
+ require 'nokogiri'
22
+ require 'remote_table/transform'
23
+ require 'remote_table/request'
24
+ require 'remote_table/package'
25
+ require 'remote_table/file'
26
+ require 'remote_table/file/csv'
27
+ require 'remote_table/file/fixed_width'
28
+ require 'remote_table/file/roo_spreadsheet'
29
+ require 'remote_table/file/ods'
30
+ require 'remote_table/file/xls'
31
+ require 'remote_table/file/xlsx'
32
+ require 'remote_table/file/html'
33
+
34
+ class RemoteTable
35
+ cattr_accessor :paths_for_removal
36
+ class << self
37
+ def cleanup
38
+ paths_for_removal.each do |path|
39
+ FileUtils.rm_rf path
40
+ paths_for_removal.delete path
41
+ end if paths_for_removal.is_a?(Array)
42
+ end
43
+
44
+ def remove_at_exit(path)
45
+ self.paths_for_removal ||= Array.new
46
+ paths_for_removal.push path
47
+ end
48
+ end
49
+
50
+ attr_accessor :request, :package, :file, :transform
51
+ attr_accessor :table
52
+
53
+ include Enumerable
54
+
55
+ def initialize(bus)
56
+ @transform = Transform.new(bus)
57
+ @package = Package.new(bus)
58
+ @request = Request.new(bus)
59
+ @file = File.new(bus)
60
+ at_exit { RemoteTable.cleanup }
61
+ end
62
+
63
+ def each
64
+ finish_table! unless table
65
+ table.each_row { |row| yield row }
66
+ end
67
+ alias :each_row :each
68
+
69
+ def to_a
70
+ cache_rows! if @_row_cache.nil?
71
+ @_row_cache
72
+ end
73
+ alias :rows :to_a
74
+
75
+ def <=>(other)
76
+ raise "Not implemented"
77
+ end
78
+
79
+ protected
80
+
81
+ def self.bang(path, cmd)
82
+ tmp_path = "#{path}.tmp"
83
+ RemoteTable.backtick_with_reporting "cat #{Escape.shell_single_word path} | #{cmd} > #{Escape.shell_single_word tmp_path}"
84
+ FileUtils.mv tmp_path, path
85
+ end
86
+
87
+ # TODO this should probably live somewhere else
88
+ def self.backtick_with_reporting(cmd)
89
+ cmd = cmd.gsub /[ ]*\n[ ]*/m, ' '
90
+ output = `#{cmd}`
91
+ if not $?.success?
92
+ raise %{
93
+ From the remote_table gem...
94
+
95
+ Command failed:
96
+ #{cmd}
97
+
98
+ Output:
99
+ #{output}
100
+ }
101
+ end
102
+ end
103
+
104
+ private
105
+
106
+ def finish_table!
107
+ package_path = request.download
108
+ file_path = package.stage(package_path)
109
+ raw_table = file.tabulate(file_path)
110
+ self.table = transform.apply(raw_table) # must return something that responds to each_row
111
+ end
112
+
113
+ def cache_rows!
114
+ @_row_cache = []
115
+ each_row { |row| @_row_cache << row }
116
+ end
117
+ end
@@ -0,0 +1,100 @@
1
class RemoteTable
  # Holds the parsing options for one staged file and mixes in the format-specific
  # row reader (RemoteTable::Csv, RemoteTable::Html, ...) chosen from +format+.
  # The private helpers rewrite the file on disk through shell tools.
  class File
    # Byte sequences stripped by remove_useless_characters!. They are deliberately
    # single-quoted: the backslash escapes are handed to perl literally, and perl
    # interprets them inside its substitution pattern.
    USELESS_CHARACTERS = [
      '\xef\xbb\xbf', # UTF-8 byte order mark
      '\xc2\xad'      # soft hyphen, often inserted by MS Office (html: &shy;)
    ].freeze

    attr_accessor :filename, :format, :delimiter, :skip, :cut, :crop, :sheet, :headers, :schema, :schema_name, :trap
    attr_accessor :encoding
    attr_accessor :path
    attr_accessor :keep_blank_rows
    attr_accessor :row_xpath
    attr_accessor :column_xpath

    # bus:: options hash. +format+ falls back to a guess from the filename,
    #       +sheet+ to 0, +keep_blank_rows+ to false and +encoding+ to 'UTF-8'.
    def initialize(bus)
      @filename = bus[:filename]
      @format = bus[:format] || format_from_filename
      @delimiter = bus[:delimiter]
      @sheet = bus[:sheet] || 0
      @skip = bus[:skip] # rows
      @keep_blank_rows = bus[:keep_blank_rows] || false
      @crop = bus[:crop] # rows
      @cut = bus[:cut] # columns
      @headers = bus[:headers]
      @schema = bus[:schema]
      @schema_name = bus[:schema_name]
      @trap = bus[:trap]
      @encoding = bus[:encoding] || 'UTF-8'
      @row_xpath = bus[:row_xpath]
      @column_xpath = bus[:column_xpath]
      # Mix the reader for this format (e.g. :fixed_width -> RemoteTable::FixedWidth) into this instance.
      extend "RemoteTable::#{format.to_s.camelcase}".constantize
    end

    # Remembers +path+ as the file to read and returns self.
    # For fixed-width files with an Array schema, the Slither schema is defined first.
    def tabulate(path)
      define_fixed_width_schema! if format == :fixed_width && schema.is_a?(Array) # TODO move to generic subclass callback
      self.path = path
      self
    end

    private

    # Registers a Slither definition built from the Array +schema+, whose entries
    # are (name, width, options) triples; an entry named 'spacer' becomes a spacer.
    # NOTE(review): an earlier comment here said "doesn't support trap", yet the
    # code forwards +trap+ to Slither - confirm which is intended.
    def define_fixed_width_schema!
      raise "can't define both schema_name and schema" if schema_name.present?
      self.schema_name = "autogenerated_#{filename.to_s.gsub(/[^a-z0-9_]/i, '')}".to_sym
      # The default trap accepts any arguments; a zero-arity lambda raises
      # ArgumentError on Ruby 1.9+ if it is called with the line as an argument.
      self.trap ||= lambda { |*_args| true }
      Slither.define schema_name do |d|
        d.rows do |row|
          row.trap(&trap)
          schema.each do |name, width, options|
            if name == 'spacer'
              row.spacer width
            else
              row.column name, width, options
            end
          end
        end
      end
    end

    # Copies the file to "<path>.backup" so restore_file! can undo in-place edits.
    def backup_file!
      FileUtils.cp path, "#{path}.backup"
    end

    # Drops the first +skip+ lines of the file. No-op when +skip+ is not set.
    def skip_rows!
      return unless skip
      # Integer() keeps anything but a number out of the shell command.
      RemoteTable.bang path, "tail -n +#{Integer(skip) + 1}"
    end

    # Strips every USELESS_CHARACTERS sequence from the file using perl.
    def remove_useless_characters!
      RemoteTable.bang path, "perl -pe 's/#{USELESS_CHARACTERS.join('//g; s/')}//g'"
    end

    # Re-encodes the file from +encoding+ to UTF-8 with iconv (-c is passed to iconv).
    def convert_file_to_utf8!
      RemoteTable.bang path, "iconv -c -f #{Escape.shell_single_word encoding} -t UTF-8"
    end

    # Moves "<path>.backup" back over the file when such a backup is readable.
    def restore_file!
      FileUtils.mv "#{path}.backup", path if ::File.readable? "#{path}.backup"
    end

    # Keeps only the character positions given by +cut+ (handed to `cut -c`).
    # No-op when +cut+ is not set.
    def cut_columns!
      return unless cut
      RemoteTable.bang path, "cut -c #{Escape.shell_single_word cut.to_s}"
    end

    # Keeps only lines crop.first..crop.last (1-based, inclusive).
    # No-op when +crop+ is not set.
    def crop_rows!
      return unless crop
      RemoteTable.bang path, "tail -n +#{Escape.shell_single_word crop.first.to_s} | head -n #{crop.last - crop.first + 1}"
    end

    # Guesses the format from the filename extension, ignoring case:
    # xls/ods/xlsx map to themselves, anything starting with "htm" to :html,
    # and everything else (including a missing filename or extension) to :csv.
    def format_from_filename
      extname = ::File.extname(filename.to_s).delete('.').downcase
      return :html if extname.start_with?('htm')
      %w[xls ods xlsx].include?(extname) ? extname.to_sym : :csv
    end
  end
end
@@ -0,0 +1,57 @@
1
+ if RUBY_VERSION >= "1.9"
2
+ require 'csv'
3
+ ::RemoteTable::MyCSV = ::CSV
4
+ else
5
+ require 'fastercsv'
6
+ ::RemoteTable::MyCSV = ::FasterCSV
7
+ end
8
+
9
class RemoteTable
  # Row reader for delimited text files. It relies on the host object for
  # +path+, +headers+, +delimiter+, +keep_blank_rows+ and the file-rewriting
  # helpers (backup_file!, convert_file_to_utf8!, ...).
  module Csv
    # Normalizes the file on disk, then yields one ordered hash per record.
    # Records parsed with headers are keyed by header (blank headers are dropped);
    # headerless records are keyed by column index. nil cells become ''.
    # Rows without any non-blank cell are skipped unless +keep_blank_rows+ is set.
    # The original file is restored afterwards, even on error.
    def each_row(&block)
      backup_file!
      convert_file_to_utf8!
      remove_useless_characters!
      skip_rows!
      ::RemoteTable::MyCSV.foreach(path, fastercsv_options) do |record|
        cells = ActiveSupport::OrderedHash.new
        filled = 0
        if ::RemoteTable::MyCSV::Row === record
          record.each do |header, cell|
            unless header.blank?
              cell = '' if cell.nil?
              cells[header] = cell
              filled += 1 if cell.present?
            end
          end
        elsif Array === record
          record.each_with_index do |cell, position|
            cell = '' if cell.nil?
            cells[position] = cell
            filled += 1 if cell.present?
          end
        else
          raise "Unexpected #{record.inspect}"
        end
        yield cells if keep_blank_rows || filled > 0
      end
    ensure
      restore_file!
    end

    private

    # Options hash passed to MyCSV.foreach: blank lines are skipped unless
    # +keep_blank_rows+ is set, the first row supplies headers unless +headers+
    # is exactly false, and +delimiter+ (when given) is the column separator.
    def fastercsv_options
      options = { :skip_blanks => !keep_blank_rows }
      options[:headers] = (headers == false) ? nil : :first_row
      options[:col_sep] = delimiter if delimiter
      options
    end
  end
end
@@ -0,0 +1,19 @@
1
class RemoteTable
  # Row reader for fixed-width text files, parsed by Slither using the host
  # object's +schema_name+. It relies on the host for +path+, +keep_blank_rows+
  # and the file-rewriting helpers.
  module FixedWidth
    # Normalizes, crops, skips and cuts the file on disk, parses it with Slither
    # and yields each row hash after removing entries with blank keys.
    # Rows with no non-blank value are skipped unless +keep_blank_rows+ is set.
    # The original file is restored afterwards, even on error.
    def each_row(&block)
      backup_file!
      convert_file_to_utf8!
      remove_useless_characters!
      crop_rows!
      skip_rows!
      cut_columns!
      parsed = Slither.parse(path, schema_name)
      parsed[:rows].each do |record|
        record.reject! { |key, _value| key.blank? }
        has_content = record.any? { |_key, value| value.present? }
        yield record if keep_blank_rows || has_content
      end
    ensure
      restore_file!
    end
  end
end
@@ -0,0 +1,37 @@
1
# CGI.unescapeHTML is used below; require it here instead of relying on some
# other library having loaded it.
require 'cgi'

class RemoteTable
  # Row reader for HTML tables, located with the host object's +row_xpath+ and
  # +column_xpath+. It relies on the host for +path+, +headers+,
  # +keep_blank_rows+ and the file-rewriting helpers.
  module Html
    # Normalizes the file on disk, parses it with Nokogiri and yields one hash
    # per matched row, keyed by header. Headers come from +headers+ when it is
    # an Array; otherwise the first matched row supplies them and is not yielded.
    # Cell text has whitespace runs collapsed to a single space and is stripped.
    # Rows with no non-blank value are skipped unless +keep_blank_rows+ is set.
    # The original file is restored afterwards, even on error.
    def each_row(&block)
      backup_file!
      convert_file_to_utf8!
      remove_useless_characters!
      html_headers = headers.is_a?(Array) ? headers : nil
      document = Nokogiri::HTML(unescaped_html_without_soft_hyphens, nil, 'UTF-8')
      document.xpath(row_xpath).each do |row|
        values = row.xpath(column_xpath).map { |td| td.content.gsub(/\s+/, ' ').strip }
        if html_headers.nil?
          html_headers = values
          next
        end
        hash = zip html_headers, values
        yield hash if keep_blank_rows || hash.any? { |_key, value| value.present? }
      end
    ensure
      restore_file!
    end

    private

    # Pairs +keys+ with +values+ positionally into a Hash. Keys without a
    # matching value map to nil; surplus values are ignored.
    def zip(keys, values)
      Hash[keys.zip(values)]
    end

    # Reads the file, unescapes HTML entities with CGI.unescapeHTML and removes
    # any remaining literal "&shy;" entities.
    # should we be doing this in ruby?
    def unescaped_html_without_soft_hyphens
      CGI.unescapeHTML(IO.read(path)).gsub(/&shy;/, '')
    end
  end
end