remote_table-ruby19 0.2.30
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/.gitignore +5 -0
- data/CHANGELOG +3 -0
- data/LICENSE +20 -0
- data/README.rdoc +27 -0
- data/Rakefile +70 -0
- data/VERSION +1 -0
- data/lib/remote_table.rb +117 -0
- data/lib/remote_table/file.rb +100 -0
- data/lib/remote_table/file/csv.rb +57 -0
- data/lib/remote_table/file/fixed_width.rb +19 -0
- data/lib/remote_table/file/html.rb +37 -0
- data/lib/remote_table/file/ods.rb +11 -0
- data/lib/remote_table/file/roo_spreadsheet.rb +44 -0
- data/lib/remote_table/file/xls.rb +11 -0
- data/lib/remote_table/file/xlsx.rb +11 -0
- data/lib/remote_table/package.rb +89 -0
- data/lib/remote_table/request.rb +44 -0
- data/lib/remote_table/transform.rb +47 -0
- data/remote_table.gemspec +86 -0
- data/test/remote_table_test.rb +386 -0
- data/test/test_helper.rb +13 -0
- metadata +204 -0
data/.document
ADDED
data/CHANGELOG
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2009 Brighter Planet
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.rdoc
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
=remote_table
|
2
|
+
|
3
|
+
Remotely open and parse Excel XLS, ODS, CSV and fixed-width tables.
|
4
|
+
|
5
|
+
==Real-life usage
|
6
|
+
|
7
|
+
Used by data_miner (http://github.com/seamusabshere/data_miner)
|
8
|
+
|
9
|
+
==Example
|
10
|
+
|
11
|
+
Taken from <tt>#{GEMDIR}/test/remote_table_test.rb</tt>:
|
12
|
+
|
13
|
+
>> t = RemoteTable.new(:url => 'http://www.fueleconomy.gov/FEG/epadata/98guide6.zip', :filename => '98guide6.csv')
|
14
|
+
=> #<RemoteTable:0x359da50 @transform=#<RemoteTable::Transform:0x359d154 @select=nil, @reject=nil>, @file=#<RemoteTable::File:0x35970c4 @delimiter=nil, @headers=nil, @cut=nil, @filename="98guide6.csv", @skip=nil, @schema_name=nil, @crop=nil, @format=:csv, @trap=nil, @sheet=0, @schema=nil>, @package=#<RemoteTable::Package:0x359c538 @url="http://www.fueleconomy.gov/FEG/epadata/98guide6.zip", @filename="98guide6.csv", @compression=:zip, @packing=nil>, @request=#<RemoteTable::Request:0x3596bec @url="http://www.fueleconomy.gov/FEG/epadata/98guide6.zip">>
|
15
|
+
>> t.rows.first
|
16
|
+
=> {"cyl"=>"6", "eng dscr"=>"DOHC VTEC", "trans dscr"=>"2MODE CLKUP", "trans"=>"Auto(L4)", "cmb"=>"20", "2pv"=>nil, "carline name"=>"NSX", "displ"=>"3.0", "ucmb"=>"23.5311", "hpv"=>nil, "4pv"=>nil, "Class"=>"TWO SEATERS", "Manufacturer"=>"ACURA", "fl"=>"P", "2lv"=>nil, "G"=>nil, "hlv"=>nil, "drv"=>"R", "cty"=>"18", "ucty"=>"19.8733", "S"=>nil, "4lv"=>nil, "fcost"=>"1050", "T"=>nil, "hwy"=>"24", "uhwy"=>"30.3612"}
|
17
|
+
|
18
|
+
See the test file and also data_miner examples of custom parsers.
|
19
|
+
|
20
|
+
==Authors
|
21
|
+
|
22
|
+
* Seamus Abshere <seamus@abshere.net>
|
23
|
+
* Andy Rossmeissl <andy@rossmeissl.net>
|
24
|
+
|
25
|
+
== Copyright
|
26
|
+
|
27
|
+
Copyright (c) 2010 Brighter Planet. See LICENSE for details.
|
data/Rakefile
ADDED
@@ -0,0 +1,70 @@
|
|
1
|
+
# Build, test, coverage and documentation tasks for the remote_table gem.
require 'rubygems'
require 'rake'

# Packaging/release tasks. When Jeweler cannot be loaded, only a hint is
# printed and the remaining tasks stay available.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "remote_table"
    gem.summary = %Q{Remotely open and parse XLS, ODS, CSV and fixed-width tables.}
    gem.description = %Q{Remotely open and parse Excel XLS, ODS, CSV and fixed-width tables.}
    gem.email = "seamus@abshere.net"
    gem.homepage = "http://github.com/seamusabshere/remote_table"
    gem.authors = ["Seamus Abshere", "Andy Rossmeissl"]
    gem.add_dependency 'roo', '1.3.11' # 1.9.3 breaks
    gem.add_dependency 'fastercsv', '>=1.5.0'
    gem.add_dependency 'activesupport', '>=2.3.4'
    gem.add_dependency 'slither', '>=0.99.3'
    gem.add_dependency 'nokogiri', '>=1.4.1'
    gem.add_dependency 'escape', '>=0.0.4'
    gem.add_development_dependency 'errata', '>=0.2.0'
    gem.require_path = "lib"
    gem.rdoc_options << '--line-numbers' << '--inline-source'
    gem.requirements << 'curl'
    gem.rubyforge_project = "remotetable"
  end
  Jeweler::GemcutterTasks.new
  Jeweler::RubyforgeTasks.new do |rubyforge|
    rubyforge.doc_task = "rdoc"
  end
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
end

# `rake test` runs every test/**/*_test.rb file.
require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/*_test.rb'
  test.verbose = true
end

# `rake rcov` measures coverage; without RCov the task aborts with a hint.
begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |test|
    test.libs << 'test'
    test.pattern = 'test/**/*_test.rb'
    test.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end

task :default => :test

# `rake rdoc` builds the API documentation from the README and lib/.
require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
  # Strip surrounding whitespace (such as a trailing newline in the VERSION
  # file) so it does not leak into the generated title.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "remote_table #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.2.30
|
data/lib/remote_table.rb
ADDED
@@ -0,0 +1,117 @@
|
|
1
|
+
require 'digest/md5'
|
2
|
+
require 'uri'
|
3
|
+
require 'tmpdir'
|
4
|
+
require 'active_support'
|
5
|
+
require 'active_support/version'
|
6
|
+
%w{
|
7
|
+
active_support/core_ext/string/conversions
|
8
|
+
active_support/core_ext/object/blank
|
9
|
+
active_support/core_ext/string/inflections
|
10
|
+
active_support/core_ext/array/wrap
|
11
|
+
active_support/core_ext/hash/except
|
12
|
+
active_support/core_ext/class/attribute_accessors
|
13
|
+
}.each do |active_support_3_requirement|
|
14
|
+
require active_support_3_requirement
|
15
|
+
end if ActiveSupport::VERSION::MAJOR == 3
|
16
|
+
|
17
|
+
require 'escape'
|
18
|
+
require 'slither'
|
19
|
+
require 'roo'
|
20
|
+
I_KNOW_I_AM_USING_AN_OLD_AND_BUGGY_VERSION_OF_LIBXML2 = true
|
21
|
+
require 'nokogiri'
|
22
|
+
require 'remote_table/transform'
|
23
|
+
require 'remote_table/request'
|
24
|
+
require 'remote_table/package'
|
25
|
+
require 'remote_table/file'
|
26
|
+
require 'remote_table/file/csv'
|
27
|
+
require 'remote_table/file/fixed_width'
|
28
|
+
require 'remote_table/file/roo_spreadsheet'
|
29
|
+
require 'remote_table/file/ods'
|
30
|
+
require 'remote_table/file/xls'
|
31
|
+
require 'remote_table/file/xlsx'
|
32
|
+
require 'remote_table/file/html'
|
33
|
+
|
34
|
+
class RemoteTable
|
35
|
+
cattr_accessor :paths_for_removal
|
36
|
+
class << self
|
37
|
+
# Deletes every path queued via remove_at_exit and empties the queue.
#
# Paths are shifted off the front of the queue one at a time. Deleting
# from the array while iterating it with #each makes the iterator skip the
# element that follows each removal, so only every other queued path
# would actually be removed.
#
# Returns the (now empty) queue, or nil when nothing was ever queued.
def cleanup
  queue = paths_for_removal
  return unless queue.is_a?(Array)
  FileUtils.rm_rf(queue.shift) until queue.empty?
  queue
end
|
43
|
+
|
44
|
+
# Queues +path+ for deletion by cleanup, creating the queue on first use.
# Returns the queue.
def remove_at_exit(path)
  self.paths_for_removal = [] unless paths_for_removal
  paths_for_removal << path
end
|
48
|
+
end
|
49
|
+
|
50
|
+
attr_accessor :request, :package, :file, :transform
|
51
|
+
attr_accessor :table
|
52
|
+
|
53
|
+
include Enumerable
|
54
|
+
|
55
|
+
# Builds the four collaborators (transform, package, request, file) from
# the same options object and registers an at_exit hook that runs
# RemoteTable.cleanup.
#
# bus - options object indexed with symbols; it is handed unchanged to
#       each collaborator's constructor.
def initialize(bus)
  self.transform = Transform.new(bus)
  self.package = Package.new(bus)
  self.request = Request.new(bus)
  self.file = File.new(bus)
  at_exit do
    RemoteTable.cleanup
  end
end
|
62
|
+
|
63
|
+
# Yields every row of the table, building the table on first use.
#
# Without a block an Enumerator over the rows is returned (the usual
# Enumerable convention) instead of failing with LocalJumpError on the
# first row; the table is then only built once enumeration starts.
def each
  return enum_for(:each) unless block_given?
  finish_table! unless table
  table.each_row { |row| yield row }
end
|
67
|
+
alias :each_row :each
|
68
|
+
|
69
|
+
# Returns all rows as an array, filling the row cache on the first call
# and reusing it afterwards.
def to_a
  return @_row_cache unless @_row_cache.nil?
  cache_rows!
  @_row_cache
end
|
73
|
+
alias :rows :to_a
|
74
|
+
|
75
|
+
# Comparison is not supported; always raises RuntimeError.
def <=>(other)
  fail "Not implemented"
end
|
78
|
+
|
79
|
+
protected
|
80
|
+
|
81
|
+
# Rewrites the file at +path+ in place by piping it through +cmd+.
#
# The output is written to "<path>.tmp" and then moved over the original.
# +path+ is shell-escaped; +cmd+ is inserted into the pipeline verbatim,
# so callers are responsible for escaping anything they put into it.
def self.bang(path, cmd)
  tmp_path = "#{path}.tmp"
  source = Escape.shell_single_word(path)
  target = Escape.shell_single_word(tmp_path)
  RemoteTable.backtick_with_reporting("cat #{source} | #{cmd} > #{target}")
  FileUtils.mv(tmp_path, path)
end
|
86
|
+
|
87
|
+
# TODO this should probably live somewhere else
|
88
|
+
# Runs +cmd+ through backticks after folding it onto a single line
# (each newline, with the spaces around it, becomes one space).
#
# Returns nil when the command exits successfully; otherwise raises a
# RuntimeError whose message contains the command and its captured output.
def self.backtick_with_reporting(cmd)
  cmd = cmd.gsub(/[ ]*\n[ ]*/m, ' ')
  output = `#{cmd}`
  return if $?.success?
  raise %{
From the remote_table gem...

Command failed:
#{cmd}

Output:
#{output}
}
end
|
103
|
+
|
104
|
+
private
|
105
|
+
|
106
|
+
# Runs the full pipeline and stores its result in +table+:
# download the package, stage the file out of it, tabulate the file and
# apply the transform. The transform result must respond to each_row.
def finish_table!
  self.table = transform.apply(file.tabulate(package.stage(request.download)))
end
|
112
|
+
|
113
|
+
# Resets the row cache and refills it with every row yielded by each_row.
def cache_rows!
  collected = @_row_cache = []
  each_row do |row|
    collected.push(row)
  end
end
|
117
|
+
end
|
@@ -0,0 +1,100 @@
|
|
1
|
+
class RemoteTable
|
2
|
+
class File
|
3
|
+
attr_accessor :filename, :format, :delimiter, :skip, :cut, :crop, :sheet, :headers, :schema, :schema_name, :trap
|
4
|
+
attr_accessor :encoding
|
5
|
+
attr_accessor :path
|
6
|
+
attr_accessor :keep_blank_rows
|
7
|
+
attr_accessor :row_xpath
|
8
|
+
attr_accessor :column_xpath
|
9
|
+
|
10
|
+
# Copies the file-related options out of +bus+ and mixes in the parser
# module that matches the format.
#
# The format is bus[:format] when given, otherwise it is derived from the
# filename. Options listed with a second element fall back to that value
# when the bus has none; the others are stored as found (nil when absent).
# :skip and :crop refer to rows, :cut to columns.
def initialize(bus)
  @filename = bus[:filename]
  @format = bus[:format] || format_from_filename
  [
    [:delimiter],
    [:sheet, 0],
    [:skip],
    [:keep_blank_rows, false],
    [:crop],
    [:cut],
    [:headers],
    [:schema],
    [:schema_name],
    [:trap],
    [:encoding, 'UTF-8'],
    [:row_xpath],
    [:column_xpath]
  ].each do |option, *fallback|
    value = bus[option]
    value ||= fallback.first unless fallback.empty?
    instance_variable_set("@#{option}", value)
  end
  extend "RemoteTable::#{format.to_s.camelcase}".constantize
end
|
28
|
+
|
29
|
+
# Remembers +path+ as the file to parse and returns self, so the result
# responds to the each_row mixed in for the format.
#
# For the :fixed_width format with an Array schema, the Slither schema is
# defined first.
def tabulate(path)
  inline_schema = (format == :fixed_width) && schema.is_a?(Array)
  define_fixed_width_schema! if inline_schema # TODO move to generic subclass callback
  self.path = path
  self
end
|
34
|
+
|
35
|
+
private
|
36
|
+
|
37
|
+
# doesn't support trap
|
38
|
+
# Registers a Slither definition built from the inline +schema+ array.
#
# Each schema entry is [name, width, options]; an entry named 'spacer'
# becomes a spacer of that width, every other entry a column. The
# definition is registered under a name generated from the filename
# (characters other than letters, digits and underscore are dropped).
#
# When no trap was supplied, a catch-all trap is installed. It accepts any
# number of arguments: a zero-parameter lambda raises ArgumentError on
# Ruby 1.9+ as soon as it is called with an argument.
# NOTE(review): Slither presumably calls the trap with the raw line —
# confirm against the Slither version in use.
#
# Raises RuntimeError when a schema_name was given as well.
def define_fixed_width_schema!
  raise "can't define both schema_name and schema" if !schema_name.blank?
  self.schema_name = "autogenerated_#{filename.gsub(/[^a-z0-9_]/i, '')}".to_sym
  self.trap ||= lambda { |*_line| true }
  Slither.define schema_name do |d|
    d.rows do |row|
      row.trap(&trap)
      schema.each do |name, width, options|
        if name == 'spacer'
          row.spacer width
        else
          row.column name, width, options
        end
      end
    end
  end
end
|
55
|
+
|
56
|
+
# Copies the file at +path+ to "<path>.backup" so restore_file! can put
# the untouched original back later.
def backup_file!
  backup_path = "#{path}.backup"
  FileUtils.cp(path, backup_path)
end
|
59
|
+
|
60
|
+
# Drops the first +skip+ lines of the file in place (via tail).
# Does nothing when no skip count is set.
def skip_rows!
  if skip
    first_kept_line = skip + 1
    RemoteTable.bang(path, "tail -n +#{first_kept_line}")
  end
end
|
64
|
+
|
65
|
+
# Byte sequences stripped from every file before parsing. They are kept
# in single quotes so the backslash escapes reach perl untouched.
USELESS_CHARACTERS = [
  '\xef\xbb\xbf',   # UTF-8 byte order mark
  '\xc2\xad'        # soft hyphen, often inserted by MS Office (html: &shy;)
]

# Removes every USELESS_CHARACTERS sequence from the file in place, using
# one perl substitution per sequence.
def remove_useless_characters!
  substitutions = USELESS_CHARACTERS.map { |sequence| "s/#{sequence}//g" }
  RemoteTable.bang(path, "perl -pe '#{substitutions.join('; ')}'")
end
|
72
|
+
|
73
|
+
# Re-encodes the file in place from +encoding+ to UTF-8 with iconv;
# the -c flag is passed to iconv as well.
def convert_file_to_utf8!
  source_encoding = Escape.shell_single_word(encoding)
  RemoteTable.bang(path, "iconv -c -f #{source_encoding} -t UTF-8")
end
|
76
|
+
|
77
|
+
# Moves "<path>.backup" back over +path+, undoing the in-place edits.
# Does nothing when there is no readable backup.
def restore_file!
  backup_path = "#{path}.backup"
  return unless ::File.readable?(backup_path)
  FileUtils.mv(backup_path, path)
end
|
80
|
+
|
81
|
+
# Keeps only the character columns given by +cut+ (a `cut -c` list),
# rewriting the file in place. Does nothing when no cut is set.
def cut_columns!
  if cut
    column_list = Escape.shell_single_word(cut.to_s)
    RemoteTable.bang(path, "cut -c #{column_list}")
  end
end
|
85
|
+
|
86
|
+
# Keeps only the lines from crop.first through crop.last (1-based,
# inclusive), rewriting the file in place with tail and head.
# Does nothing when no crop is set.
def crop_rows!
  if crop
    first_line = Escape.shell_single_word(crop.first.to_s)
    line_count = crop.last - crop.first + 1
    RemoteTable.bang(path, "tail -n +#{first_line} | head -n #{line_count}")
  end
end
|
90
|
+
|
91
|
+
# Guesses the table format from the filename's extension.
#
# The extension is compared case-insensitively, so "REPORT.XLS" and
# "index.HTML" are recognised just like their lower-case spellings.
#
# Returns :xls, :ods or :xlsx for those extensions, :html for any
# extension starting with "htm", and :csv for everything else, including
# a missing extension or a missing (nil) filename.
def format_from_filename
  extname = ::File.extname(filename.to_s).delete('.').downcase
  case extname
  when 'xls', 'ods', 'xlsx' then extname.to_sym
  when /\Ahtm/ then :html
  else :csv
  end
end
|
99
|
+
end
|
100
|
+
end
|
@@ -0,0 +1,57 @@
|
|
1
|
+
if RUBY_VERSION >= "1.9"
|
2
|
+
require 'csv'
|
3
|
+
::RemoteTable::MyCSV = ::CSV
|
4
|
+
else
|
5
|
+
require 'fastercsv'
|
6
|
+
::RemoteTable::MyCSV = ::FasterCSV
|
7
|
+
end
|
8
|
+
|
9
|
+
class RemoteTable
|
10
|
+
module Csv
|
11
|
+
# Yields each CSV row as an ActiveSupport::OrderedHash.
#
# The file is backed up, converted to UTF-8, stripped of useless
# characters and of the skipped leading rows before parsing; the backup is
# restored afterwards, even when parsing raises.
#
# Rows parsed with headers are keyed by header (cells under a blank header
# are dropped); rows parsed without headers are keyed by column index.
# nil cells are stored as ''. A row without any non-blank cell is only
# yielded when keep_blank_rows is set.
def each_row(&block)
  backup_file!
  convert_file_to_utf8!
  remove_useless_characters!
  skip_rows!
  ::RemoteTable::MyCSV.foreach(path, fastercsv_options) do |row|
    ordered_hash = ActiveSupport::OrderedHash.new
    filled_values = 0
    # Stores one cell and counts it when it is not blank.
    store = lambda do |key, value|
      cell = value.nil? ? '' : value
      ordered_hash[key] = cell
      filled_values += 1 if cell.present?
    end
    if row.is_a?(::RemoteTable::MyCSV::Row)
      row.each { |header, value| store.call(header, value) unless header.blank? }
    elsif row.is_a?(Array)
      row.each_with_index { |value, index| store.call(index, value) }
    else
      raise "Unexpected #{row.inspect}"
    end
    yield ordered_hash if keep_blank_rows || filled_values.nonzero?
  end
ensure
  restore_file!
end
|
43
|
+
|
44
|
+
private
|
45
|
+
|
46
|
+
# Builds the options hash handed to the CSV parser.
#
# Blank lines are skipped unless keep_blank_rows is set. The first row is
# treated as the header row unless headers is exactly false. A custom
# column separator is only passed when a delimiter was configured.
def fastercsv_options
  options = {
    :skip_blanks => !keep_blank_rows,
    :headers => (headers == false ? nil : :first_row)
  }
  options[:col_sep] = delimiter if delimiter
  options
end
|
56
|
+
end
|
57
|
+
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
class RemoteTable
|
2
|
+
module FixedWidth
|
3
|
+
# Yields each fixed-width row as a hash parsed by Slither with the
# schema registered under schema_name.
#
# The file is backed up, converted to UTF-8, stripped of useless
# characters, cropped, skipped and cut before parsing; the backup is
# restored afterwards, even when parsing raises.
#
# Entries with a blank key are removed from every row. A row without any
# non-blank value is only yielded when keep_blank_rows is set.
def each_row(&block)
  backup_file!
  convert_file_to_utf8!
  remove_useless_characters!
  crop_rows!
  skip_rows!
  cut_columns!
  parsed = Slither.parse(path, schema_name)
  parsed[:rows].each do |record|
    record.reject! { |key, _value| key.blank? }
    next unless keep_blank_rows || record.any? { |_key, value| value.present? }
    yield record
  end
ensure
  restore_file!
end
|
18
|
+
end
|
19
|
+
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
class RemoteTable
|
2
|
+
module Html
|
3
|
+
# Yields each HTML table row as a hash of header => cell text.
#
# Rows are selected with row_xpath and cells with column_xpath; runs of
# whitespace inside a cell are collapsed to one space and the text is
# stripped. When +headers+ is not an Array, the first matched row supplies
# the headers and is not yielded. A row without any non-blank value is
# only yielded when keep_blank_rows is set.
#
# The file is backed up, converted to UTF-8 and stripped of useless
# characters first; the backup is restored afterwards, even on error.
def each_row(&block)
  backup_file!
  convert_file_to_utf8!
  remove_useless_characters!
  html_headers = headers.is_a?(Array) ? headers : nil
  document = Nokogiri::HTML(unescaped_html_without_soft_hyphens, nil, 'UTF-8')
  document.xpath(row_xpath).each do |row|
    values = row.xpath(column_xpath).map do |td|
      td.content.gsub(/\s+/, ' ').strip
    end
    if html_headers.nil?
      html_headers = values
    else
      hash = zip(html_headers, values)
      yield hash if keep_blank_rows || hash.any? { |_key, value| value.present? }
    end
  end
ensure
  restore_file!
end
|
20
|
+
|
21
|
+
private
|
22
|
+
|
23
|
+
# http://snippets.dzone.com/posts/show/406
|
24
|
+
# Pairs +keys+ with +values+ position by position and returns the result
# as a hash. Keys without a matching value map to nil; surplus values are
# ignored.
def zip(keys, values)
  Hash[keys.zip(values)]
end
|
29
|
+
|
30
|
+
# should we be doing this in ruby?
|
31
|
+
# Reads the file at +path+, unescapes the HTML entities handled by
# CGI.unescapeHTML and removes soft-hyphen entities from the result.
#
# The pattern is spelled as the entity text "&shy;". Raw soft-hyphen
# bytes are already stripped from the file by remove_useless_characters!
# before this method is reached from each_row.
# NOTE(review): the pattern was restored from what looks like a rendered
# "&shy;" entity — confirm against the upstream source.
def unescaped_html_without_soft_hyphens
  str = CGI.unescapeHTML(IO.read(path))
  str.gsub(/&shy;/, '')
end
|
36
|
+
end
|
37
|
+
end
|