remote_table-ruby19 0.2.30

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
@@ -0,0 +1,5 @@
1
+ *.sw?
2
+ .DS_Store
3
+ coverage
4
+ rdoc
5
+ pkg
@@ -0,0 +1,3 @@
1
+ 0.1.6
2
+ * For CSVs, force-convert headers to UTF-8 using String#toutf8, because passing :encoding => 'N' or 'U' didn't work.
3
+ * Fix handling of long urls when passing off to Tempfile.
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Brighter Planet
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,27 @@
1
+ =remote_table
2
+
3
+ Remotely open and parse Excel XLS, ODS, CSV and fixed-width tables.
4
+
5
+ ==Real-life usage
6
+
7
+ Used by data_miner (http://github.com/seamusabshere/data_miner)
8
+
9
+ ==Example
10
+
11
+ Taken from <tt>#{GEMDIR}/test/remote_table_test.rb</tt>:
12
+
13
+ >> t = RemoteTable.new(:url => 'http://www.fueleconomy.gov/FEG/epadata/98guide6.zip', :filename => '98guide6.csv')
14
+ => #<RemoteTable:0x359da50 @transform=#<RemoteTable::Transform:0x359d154 @select=nil, @reject=nil>, @file=#<RemoteTable::File:0x35970c4 @delimiter=nil, @headers=nil, @cut=nil, @filename="98guide6.csv", @skip=nil, @schema_name=nil, @crop=nil, @format=:csv, @trap=nil, @sheet=0, @schema=nil>, @package=#<RemoteTable::Package:0x359c538 @url="http://www.fueleconomy.gov/FEG/epadata/98guide6.zip", @filename="98guide6.csv", @compression=:zip, @packing=nil>, @request=#<RemoteTable::Request:0x3596bec @url="http://www.fueleconomy.gov/FEG/epadata/98guide6.zip">>
15
+ >> t.rows.first
16
+ => {"cyl"=>"6", "eng dscr"=>"DOHC VTEC", "trans dscr"=>"2MODE CLKUP", "trans"=>"Auto(L4)", "cmb"=>"20", "2pv"=>nil, "carline name"=>"NSX", "displ"=>"3.0", "ucmb"=>"23.5311", "hpv"=>nil, "4pv"=>nil, "Class"=>"TWO SEATERS", "Manufacturer"=>"ACURA", "fl"=>"P", "2lv"=>nil, "G"=>nil, "hlv"=>nil, "drv"=>"R", "cty"=>"18", "ucty"=>"19.8733", "S"=>nil, "4lv"=>nil, "fcost"=>"1050", "T"=>nil, "hwy"=>"24", "uhwy"=>"30.3612"}
17
+
18
+ See the test file, as well as the data_miner project, for examples of custom parsers.
19
+
20
+ ==Authors
21
+
22
+ * Seamus Abshere <seamus@abshere.net>
23
+ * Andy Rossmeissl <andy@rossmeissl.net>
24
+
25
+ == Copyright
26
+
27
+ Copyright (c) 2010 Brighter Planet. See LICENSE for details.
@@ -0,0 +1,70 @@
1
require 'rubygems'
require 'rake'

# Gem packaging and release tasks. They are only defined when Jeweler can be
# loaded; otherwise a hint is printed and the remaining tasks still work.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |spec|
    spec.name = "remote_table"
    spec.summary = %Q{Remotely open and parse XLS, ODS, CSV and fixed-width tables.}
    spec.description = %Q{Remotely open and parse Excel XLS, ODS, CSV and fixed-width tables.}
    spec.email = "seamus@abshere.net"
    spec.homepage = "http://github.com/seamusabshere/remote_table"
    spec.authors = ["Seamus Abshere", "Andy Rossmeissl"]
    spec.add_dependency 'roo', '1.3.11' # 1.9.3 breaks
    spec.add_dependency 'fastercsv', '>=1.5.0'
    spec.add_dependency 'activesupport', '>=2.3.4'
    spec.add_dependency 'slither', '>=0.99.3'
    spec.add_dependency 'nokogiri', '>=1.4.1'
    spec.add_dependency 'escape', '>=0.0.4'
    spec.add_development_dependency 'errata', '>=0.2.0'
    spec.require_path = "lib"
    spec.rdoc_options << '--line-numbers' << '--inline-source'
    spec.requirements << 'curl'
    spec.rubyforge_project = "remotetable"
  end
  Jeweler::GemcutterTasks.new
  Jeweler::RubyforgeTasks.new do |rubyforge|
    rubyforge.doc_task = "rdoc"
  end
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
end

# Unit tests: every test/**/*_test.rb file, with lib and test on the load path.
require 'rake/testtask'
Rake::TestTask.new(:test) do |t|
  t.libs << 'lib' << 'test'
  t.pattern = 'test/**/*_test.rb'
  t.verbose = true
end

# Coverage task. When rcov cannot be loaded, a placeholder task is defined
# that aborts with installation instructions.
begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |t|
    t.libs << 'test'
    t.pattern = 'test/**/*_test.rb'
    t.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end

task :default => :test

# API documentation, titled with the contents of the VERSION file when present.
require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
  version = File.exist?('VERSION') ? File.read('VERSION') : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "remote_table #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.2.30
@@ -0,0 +1,117 @@
1
+ require 'digest/md5'
2
+ require 'uri'
3
+ require 'tmpdir'
4
+ require 'active_support'
5
+ require 'active_support/version'
6
+ %w{
7
+ active_support/core_ext/string/conversions
8
+ active_support/core_ext/object/blank
9
+ active_support/core_ext/string/inflections
10
+ active_support/core_ext/array/wrap
11
+ active_support/core_ext/hash/except
12
+ active_support/core_ext/class/attribute_accessors
13
+ }.each do |active_support_3_requirement|
14
+ require active_support_3_requirement
15
+ end if ActiveSupport::VERSION::MAJOR == 3
16
+
17
+ require 'escape'
18
+ require 'slither'
19
+ require 'roo'
20
+ I_KNOW_I_AM_USING_AN_OLD_AND_BUGGY_VERSION_OF_LIBXML2 = true
21
+ require 'nokogiri'
22
+ require 'remote_table/transform'
23
+ require 'remote_table/request'
24
+ require 'remote_table/package'
25
+ require 'remote_table/file'
26
+ require 'remote_table/file/csv'
27
+ require 'remote_table/file/fixed_width'
28
+ require 'remote_table/file/roo_spreadsheet'
29
+ require 'remote_table/file/ods'
30
+ require 'remote_table/file/xls'
31
+ require 'remote_table/file/xlsx'
32
+ require 'remote_table/file/html'
33
+
34
+ class RemoteTable
35
+ cattr_accessor :paths_for_removal
36
+ class << self
37
# Deletes every path registered through remove_at_exit and empties the
# registry.
#
# The registry is drained with shift instead of being deleted from while it
# is iterated with each: removing elements from an Array during each makes
# the iterator skip entries, which left some paths on disk and still
# registered.
#
# Does nothing when no path has been registered yet (the registry is not an
# Array). Returns nil.
def cleanup
  return unless paths_for_removal.is_a?(Array)
  until paths_for_removal.empty?
    FileUtils.rm_rf paths_for_removal.shift
  end
end
43
+
44
# Registers +path+ for deletion by cleanup, creating the registry on first
# use. Returns the registry.
def remove_at_exit(path)
  self.paths_for_removal = [] unless paths_for_removal
  paths_for_removal << path
end
48
+ end
49
+
50
+ attr_accessor :request, :package, :file, :transform
51
+ attr_accessor :table
52
+
53
+ include Enumerable
54
+
55
# Builds the four collaborators (transform, package, request, file) from the
# same options object and registers an at_exit hook that runs
# RemoteTable.cleanup.
#
# bus - options object handed unchanged to each collaborator's constructor.
def initialize(bus)
  self.transform = Transform.new(bus)
  self.package = Package.new(bus)
  self.request = Request.new(bus)
  self.file = File.new(bus)
  at_exit { RemoteTable.cleanup }
end
62
+
63
# Yields each row of the table, building the table on first use.
#
# Without a block an Enumerator over the rows is returned instead of failing
# with LocalJumpError, which is the usual contract for an Enumerable#each.
def each
  return enum_for(:each) unless block_given?
  finish_table! unless table
  table.each_row { |row| yield row }
end
67
+ alias :each_row :each
68
+
69
# Returns all rows as an Array, filling the row cache on the first call and
# reusing it afterwards.
def to_a
  return @_row_cache unless @_row_cache.nil?
  cache_rows!
  @_row_cache
end
73
+ alias :rows :to_a
74
+
75
# Comparison between tables is not supported; always raises RuntimeError.
def <=>(other)
  raise RuntimeError, "Not implemented"
end
78
+
79
+ protected
80
+
81
# Rewrites the file at +path+ in place by piping it through the shell
# filter +cmd+: the output goes to "<path>.tmp", which then replaces the
# original file.
#
# path - file to filter; shell-escaped before use.
# cmd  - shell command text, inserted into the pipeline as-is.
def self.bang(path, cmd)
  scratch = "#{path}.tmp"
  source_arg = Escape.shell_single_word(path)
  target_arg = Escape.shell_single_word(scratch)
  RemoteTable.backtick_with_reporting("cat #{source_arg} | #{cmd} > #{target_arg}")
  FileUtils.mv(scratch, path)
end
86
+
87
+ # TODO this should probably live somewhere else
88
# Runs +cmd+ through the shell after collapsing each newline (and the spaces
# around it) into a single space.
#
# Returns nil when the command exits successfully. Otherwise raises a
# RuntimeError whose message contains the flattened command and the output
# it produced.
def self.backtick_with_reporting(cmd)
  cmd = cmd.gsub(/[ ]*\n[ ]*/m, ' ')
  output = `#{cmd}`
  return if $?.success?
  raise %{
From the remote_table gem...

Command failed:
#{cmd}

Output:
#{output}
}
end
103
+
104
+ private
105
+
106
# Runs the pipeline download -> stage -> tabulate -> transform and stores
# the result in +table+. The stored object must respond to each_row.
def finish_table!
  downloaded = request.download
  staged = package.stage(downloaded)
  self.table = transform.apply(file.tabulate(staged))
end
112
+
113
# Reads every row once and stores them in @_row_cache.
#
# Rows are collected in a local Array and only published to @_row_cache
# after iteration has finished. Assigning the empty Array up front meant
# that a failure part-way through left a partial cache, which to_a would
# then return as if it were the complete table.
def cache_rows!
  collected = []
  each_row { |row| collected << row }
  @_row_cache = collected
end
117
+ end
@@ -0,0 +1,100 @@
1
+ class RemoteTable
2
+ class File
3
+ attr_accessor :filename, :format, :delimiter, :skip, :cut, :crop, :sheet, :headers, :schema, :schema_name, :trap
4
+ attr_accessor :encoding
5
+ attr_accessor :path
6
+ attr_accessor :keep_blank_rows
7
+ attr_accessor :row_xpath
8
+ attr_accessor :column_xpath
9
+
10
# Copies the parsing options out of +bus+ and mixes in the format-specific
# reader module.
#
# bus - options object indexed by Symbol keys. Options read here:
#       :filename, :format, :delimiter, :sheet, :skip, :keep_blank_rows,
#       :crop, :cut, :headers, :schema, :schema_name, :trap, :encoding,
#       :row_xpath and :column_xpath.
def initialize(bus)
  @filename = bus[:filename]
  # Falls back to the file extension, so @filename has to be set first.
  @format = bus[:format] || format_from_filename

  # Options with defaults.
  @sheet = bus[:sheet] || 0
  @keep_blank_rows = bus[:keep_blank_rows] || false
  @encoding = bus[:encoding] || 'UTF-8'

  # Row and column trimming.
  @skip = bus[:skip] # rows
  @crop = bus[:crop] # rows
  @cut = bus[:cut] # columns

  # Parsing details.
  @delimiter = bus[:delimiter]
  @headers = bus[:headers]
  @schema = bus[:schema]
  @schema_name = bus[:schema_name]
  @trap = bus[:trap]
  @row_xpath = bus[:row_xpath]
  @column_xpath = bus[:column_xpath]

  # e.g. :fixed_width resolves to RemoteTable::FixedWidth
  reader_module = "RemoteTable::#{format.to_s.camelcase}".constantize
  extend reader_module
end
28
+
29
# Remembers +path+ as the file to read and returns self, which provides
# each_row through the mixed-in format module. For fixed-width files whose
# schema was given as an Array, the schema is defined first.
def tabulate(path)
  if format == :fixed_width && schema.is_a?(Array)
    define_fixed_width_schema! # TODO move to generic subclass callback
  end
  self.path = path
  self
end
34
+
35
+ private
36
+
37
+ # doesn't support trap
38
# Registers a Slither definition built from the Array schema, under a name
# derived from the filename (characters other than letters, digits and
# underscore are removed).
#
# Each schema entry is (name, width, options); an entry named 'spacer'
# becomes a spacer of that width, anything else becomes a column. When no
# trap was supplied, one that always returns true is installed.
#
# Raises RuntimeError when schema_name is already set.
def define_fixed_width_schema!
  raise "can't define both schema_name and schema" unless schema_name.blank?
  sanitized = filename.gsub(/[^a-z0-9_]/i, '')
  self.schema_name = :"autogenerated_#{sanitized}"
  self.trap = lambda { true } unless trap
  Slither.define(schema_name) do |definition|
    definition.rows do |section|
      section.trap(&trap)
      schema.each do |name, width, options|
        next section.spacer(width) if name == 'spacer'
        section.column(name, width, options)
      end
    end
  end
end
55
+
56
# Copies the working file to "<path>.backup" so restore_file! can bring the
# untouched version back later.
def backup_file!
  destination = "#{path}.backup"
  FileUtils.cp(path, destination)
end
59
+
60
# Drops the first +skip+ lines of the working file in place. Does nothing
# when +skip+ is not set.
def skip_rows!
  if skip
    first_kept_line = skip + 1
    RemoteTable.bang(path, "tail -n +#{first_kept_line}")
  end
end
64
+
65
# Byte sequences stripped from every file before parsing. They are
# single-quoted on purpose: the backslash escapes are passed to perl
# verbatim and interpreted there, not by Ruby.
USELESS_CHARACTERS = [
  '\xef\xbb\xbf', # UTF-8 byte order mark
  '\xc2\xad' # soft hyphen, often inserted by MS Office (html: &shy;)
]

# Removes every USELESS_CHARACTERS sequence from the working file in place,
# using one perl substitution per sequence.
def remove_useless_characters!
  substitutions = USELESS_CHARACTERS.map { |bytes| "s/#{bytes}//g" }
  RemoteTable.bang(path, "perl -pe '#{substitutions.join('; ')}'")
end
72
+
73
# Re-encodes the working file from +encoding+ to UTF-8 in place with iconv.
# The -c flag is passed to iconv.
def convert_file_to_utf8!
  source_encoding = Escape.shell_single_word(encoding)
  RemoteTable.bang(path, "iconv -c -f #{source_encoding} -t UTF-8")
end
76
+
77
# Moves "<path>.backup" back over the working file, undoing the in-place
# edits. Does nothing when no readable backup exists.
def restore_file!
  backup = "#{path}.backup"
  return unless ::File.readable?(backup)
  FileUtils.mv(backup, path)
end
80
+
81
# Keeps only the character columns described by +cut+ (passed to `cut -c`)
# in the working file. Does nothing when +cut+ is not set.
def cut_columns!
  if cut
    column_spec = Escape.shell_single_word(cut.to_s)
    RemoteTable.bang(path, "cut -c #{column_spec}")
  end
end
85
+
86
# Keeps only lines crop.first through crop.last (1-based, inclusive) of the
# working file. Does nothing when +crop+ is not set.
def crop_rows!
  if crop
    first_line = crop.first
    start_arg = Escape.shell_single_word(first_line.to_s)
    line_count = crop.last - first_line + 1
    RemoteTable.bang(path, "tail -n +#{start_arg} | head -n #{line_count}")
  end
end
90
+
91
# Guesses the table format from the extension of +filename+.
#
# Returns :xls, :ods or :xlsx for those exact extensions, :html for any
# extension starting with "htm", and :csv for everything else, including a
# missing extension.
#
# A nil filename is treated as having no extension instead of raising
# TypeError from File.extname. The extension is compared as a String
# against the known list, so arbitrary extension text is never turned into
# a Symbol.
def format_from_filename
  extname = ::File.extname(filename.to_s).delete('.')
  return :csv if extname.blank?
  return :html if extname =~ /\Ahtm/
  %w[xls ods xlsx].include?(extname) ? extname.to_sym : :csv
end
99
+ end
100
+ end
@@ -0,0 +1,57 @@
1
# Picks the CSV implementation and exposes it as RemoteTable::MyCSV:
# the fastercsv gem before Ruby 1.9, the standard library csv from 1.9 on.
if RUBY_VERSION < "1.9"
  require 'fastercsv'
  ::RemoteTable::MyCSV = ::FasterCSV
else
  require 'csv'
  ::RemoteTable::MyCSV = ::CSV
end
8
+
9
+ class RemoteTable
10
+ module Csv
11
# Yields one ordered hash per CSV row.
#
# The file is first backed up, converted to UTF-8, stripped of useless
# characters and shortened by the skipped rows; the backup is restored when
# iteration ends, whether or not it succeeded.
#
# Rows parsed with headers are keyed by header (cells under a blank header
# are dropped); rows parsed without headers are keyed by column index. nil
# cells become ''. Rows with no filled cell are only yielded when
# keep_blank_rows is set.
def each_row(&block)
  backup_file!
  convert_file_to_utf8!
  remove_useless_characters!
  skip_rows!
  ::RemoteTable::MyCSV.foreach(path, fastercsv_options) do |row|
    record = ActiveSupport::OrderedHash.new
    has_content = false
    case row
    when ::RemoteTable::MyCSV::Row
      row.each do |header, value|
        next if header.blank?
        cell = value.nil? ? '' : value
        record[header] = cell
        has_content = true if cell.present?
      end
    when Array
      row.each_with_index do |value, position|
        cell = value.nil? ? '' : value
        record[position] = cell
        has_content = true if cell.present?
      end
    else
      raise "Unexpected #{row.inspect}"
    end
    yield record if keep_blank_rows || has_content
  end
ensure
  restore_file!
end
43
+
44
+ private
45
+
46
# Builds the option hash handed to the CSV parser.
#
# :skip_blanks is the inverse of keep_blank_rows; :headers is nil when
# headers was explicitly set to false and :first_row otherwise; :col_sep is
# only present when a delimiter was configured.
def fastercsv_options
  header_mode = (headers == false) ? nil : :first_row
  options = { :skip_blanks => !keep_blank_rows, :headers => header_mode }
  options[:col_sep] = delimiter if delimiter
  options
end
56
+ end
57
+ end
@@ -0,0 +1,19 @@
1
+ class RemoteTable
2
+ module FixedWidth
3
# Yields one hash per fixed-width row parsed by Slither with schema_name.
#
# The file is first backed up, converted to UTF-8, stripped of useless
# characters, cropped, shortened by the skipped rows and cut to the wanted
# columns; the backup is restored when iteration ends, whether or not it
# succeeded.
#
# Entries with a blank key are removed from each hash. Rows with no filled
# value are only yielded when keep_blank_rows is set.
def each_row(&block)
  backup_file!
  convert_file_to_utf8!
  remove_useless_characters!
  crop_rows!
  skip_rows!
  cut_columns!
  parsed = Slither.parse(path, schema_name)
  parsed[:rows].each do |record|
    record.delete_if { |key, _value| key.blank? }
    next unless keep_blank_rows || record.any? { |_key, value| value.present? }
    yield record
  end
ensure
  restore_file!
end
18
+ end
19
+ end
@@ -0,0 +1,37 @@
1
+ class RemoteTable
2
+ module Html
3
# Yields one hash per HTML table row matched by row_xpath, with cells taken
# from column_xpath and whitespace runs collapsed to single spaces.
#
# When headers was given as an Array it supplies the keys; otherwise the
# first matched row is consumed as the header row and not yielded. Rows
# with no filled value are only yielded when keep_blank_rows is set.
#
# The file is first backed up, converted to UTF-8 and stripped of useless
# characters; the backup is restored when iteration ends, whether or not it
# succeeded.
def each_row(&block)
  backup_file!
  convert_file_to_utf8!
  remove_useless_characters!
  column_names = headers.is_a?(Array) ? headers : nil
  document = Nokogiri::HTML(unescaped_html_without_soft_hyphens, nil, 'UTF-8')
  document.xpath(row_xpath).each do |row|
    cells = row.xpath(column_xpath).map { |td| td.content.gsub(/\s+/, ' ').strip }
    if column_names.nil?
      column_names = cells
    else
      record = zip(column_names, cells)
      yield record if keep_blank_rows || record.any? { |_key, value| value.present? }
    end
  end
ensure
  restore_file!
end
20
+
21
+ private
22
+
23
+ # http://snippets.dzone.com/posts/show/406
24
# Pairs +keys+ with +values+ by position and returns the result as a Hash.
# Keys without a matching value map to nil, surplus values are ignored, and
# a repeated key keeps its last value.
def zip(keys, values)
  Hash[keys.zip(values)]
end
29
+
30
+ # should we be doing this in ruby?
31
# Reads the working file, unescapes its HTML entities with
# CGI.unescapeHTML, and then removes any "&shy;" entity text still present.
# Returns the resulting String.
def unescaped_html_without_soft_hyphens
  raw = IO.read(path)
  CGI.unescapeHTML(raw).gsub(/&shy;/, '')
end
36
+ end
37
+ end