remote_table-ruby19 0.2.30
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.gitignore +5 -0
- data/CHANGELOG +3 -0
- data/LICENSE +20 -0
- data/README.rdoc +27 -0
- data/Rakefile +70 -0
- data/VERSION +1 -0
- data/lib/remote_table.rb +117 -0
- data/lib/remote_table/file.rb +100 -0
- data/lib/remote_table/file/csv.rb +57 -0
- data/lib/remote_table/file/fixed_width.rb +19 -0
- data/lib/remote_table/file/html.rb +37 -0
- data/lib/remote_table/file/ods.rb +11 -0
- data/lib/remote_table/file/roo_spreadsheet.rb +44 -0
- data/lib/remote_table/file/xls.rb +11 -0
- data/lib/remote_table/file/xlsx.rb +11 -0
- data/lib/remote_table/package.rb +89 -0
- data/lib/remote_table/request.rb +44 -0
- data/lib/remote_table/transform.rb +47 -0
- data/remote_table.gemspec +86 -0
- data/test/remote_table_test.rb +386 -0
- data/test/test_helper.rb +13 -0
- metadata +204 -0
data/.document
ADDED
data/CHANGELOG
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2009 Brighter Planet
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.rdoc
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
=remote_table
|
2
|
+
|
3
|
+
Remotely open and parse Excel XLS, ODS, CSV and fixed-width tables.
|
4
|
+
|
5
|
+
==Real-life usage
|
6
|
+
|
7
|
+
Used by data_miner (http://github.com/seamusabshere/data_miner)
|
8
|
+
|
9
|
+
==Example
|
10
|
+
|
11
|
+
Taken from <tt>#{GEMDIR}/test/remote_table_test.rb</tt>:
|
12
|
+
|
13
|
+
>> t = RemoteTable.new(:url => 'http://www.fueleconomy.gov/FEG/epadata/98guide6.zip', :filename => '98guide6.csv')
|
14
|
+
=> #<RemoteTable:0x359da50 @transform=#<RemoteTable::Transform:0x359d154 @select=nil, @reject=nil>, @file=#<RemoteTable::File:0x35970c4 @delimiter=nil, @headers=nil, @cut=nil, @filename="98guide6.csv", @skip=nil, @schema_name=nil, @crop=nil, @format=:csv, @trap=nil, @sheet=0, @schema=nil>, @package=#<RemoteTable::Package:0x359c538 @url="http://www.fueleconomy.gov/FEG/epadata/98guide6.zip", @filename="98guide6.csv", @compression=:zip, @packing=nil>, @request=#<RemoteTable::Request:0x3596bec @url="http://www.fueleconomy.gov/FEG/epadata/98guide6.zip">>
|
15
|
+
>> t.rows.first
|
16
|
+
=> {"cyl"=>"6", "eng dscr"=>"DOHC VTEC", "trans dscr"=>"2MODE CLKUP", "trans"=>"Auto(L4)", "cmb"=>"20", "2pv"=>nil, "carline name"=>"NSX", "displ"=>"3.0", "ucmb"=>"23.5311", "hpv"=>nil, "4pv"=>nil, "Class"=>"TWO SEATERS", "Manufacturer"=>"ACURA", "fl"=>"P", "2lv"=>nil, "G"=>nil, "hlv"=>nil, "drv"=>"R", "cty"=>"18", "ucty"=>"19.8733", "S"=>nil, "4lv"=>nil, "fcost"=>"1050", "T"=>nil, "hwy"=>"24", "uhwy"=>"30.3612"}
|
17
|
+
|
18
|
+
See the test file and also data_miner examples of custom parsers.
|
19
|
+
|
20
|
+
==Authors
|
21
|
+
|
22
|
+
* Seamus Abshere <seamus@abshere.net>
|
23
|
+
* Andy Rossmeissl <andy@rossmeissl.net>
|
24
|
+
|
25
|
+
== Copyright
|
26
|
+
|
27
|
+
Copyright (c) 2010 Brighter Planet. See LICENSE for details.
|
data/Rakefile
ADDED
@@ -0,0 +1,70 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'rake'
|
3
|
+
|
4
|
+
# Packaging and release tasks come from Jeweler. When it cannot be loaded,
# print an install hint instead of failing, so the tasks below are still defined.
begin
  require 'jeweler'

  Jeweler::Tasks.new do |spec|
    spec.name = "remote_table"
    spec.summary = "Remotely open and parse XLS, ODS, CSV and fixed-width tables."
    spec.description = "Remotely open and parse Excel XLS, ODS, CSV and fixed-width tables."
    spec.email = "seamus@abshere.net"
    spec.homepage = "http://github.com/seamusabshere/remote_table"
    spec.authors = ["Seamus Abshere", "Andy Rossmeissl"]
    spec.add_dependency 'roo', '1.3.11' # 1.9.3 breaks
    spec.add_dependency 'fastercsv', '>=1.5.0'
    spec.add_dependency 'activesupport', '>=2.3.4'
    spec.add_dependency 'slither', '>=0.99.3'
    spec.add_dependency 'nokogiri', '>=1.4.1'
    spec.add_dependency 'escape', '>=0.0.4'
    spec.add_development_dependency 'errata', '>=0.2.0'
    spec.require_path = "lib"
    spec.rdoc_options << '--line-numbers' << '--inline-source'
    spec.requirements << 'curl'
    spec.rubyforge_project = "remotetable"
  end

  Jeweler::GemcutterTasks.new

  Jeweler::RubyforgeTasks.new { |rubyforge| rubyforge.doc_task = "rdoc" }
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
end

# `rake test` runs every *_test.rb file under test/.
require 'rake/testtask'
Rake::TestTask.new(:test) do |t|
  t.libs << 'lib' << 'test'
  t.pattern = 'test/**/*_test.rb'
  t.verbose = true
end

# Coverage task; if rcov cannot be loaded, `rake rcov` aborts with an install hint.
begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |t|
    t.libs << 'test'
    t.pattern = 'test/**/*_test.rb'
    t.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end

task :default => :test

# RDoc generation, titled with the contents of the VERSION file when present.
require 'rake/rdoctask'
Rake::RDocTask.new do |doc|
  version = File.exist?('VERSION') ? File.read('VERSION') : ""

  doc.rdoc_dir = 'rdoc'
  doc.title = "remote_table #{version}"
  doc.rdoc_files.include('README*')
  doc.rdoc_files.include('lib/**/*.rb')
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.2.30
|
data/lib/remote_table.rb
ADDED
@@ -0,0 +1,117 @@
|
|
1
|
+
require 'digest/md5'
|
2
|
+
require 'uri'
|
3
|
+
require 'tmpdir'
|
4
|
+
require 'active_support'
|
5
|
+
require 'active_support/version'
|
6
|
+
%w{
|
7
|
+
active_support/core_ext/string/conversions
|
8
|
+
active_support/core_ext/object/blank
|
9
|
+
active_support/core_ext/string/inflections
|
10
|
+
active_support/core_ext/array/wrap
|
11
|
+
active_support/core_ext/hash/except
|
12
|
+
active_support/core_ext/class/attribute_accessors
|
13
|
+
}.each do |active_support_3_requirement|
|
14
|
+
require active_support_3_requirement
|
15
|
+
end if ActiveSupport::VERSION::MAJOR == 3
|
16
|
+
|
17
|
+
require 'escape'
|
18
|
+
require 'slither'
|
19
|
+
require 'roo'
|
20
|
+
I_KNOW_I_AM_USING_AN_OLD_AND_BUGGY_VERSION_OF_LIBXML2 = true
|
21
|
+
require 'nokogiri'
|
22
|
+
require 'remote_table/transform'
|
23
|
+
require 'remote_table/request'
|
24
|
+
require 'remote_table/package'
|
25
|
+
require 'remote_table/file'
|
26
|
+
require 'remote_table/file/csv'
|
27
|
+
require 'remote_table/file/fixed_width'
|
28
|
+
require 'remote_table/file/roo_spreadsheet'
|
29
|
+
require 'remote_table/file/ods'
|
30
|
+
require 'remote_table/file/xls'
|
31
|
+
require 'remote_table/file/xlsx'
|
32
|
+
require 'remote_table/file/html'
|
33
|
+
|
34
|
+
# Enumerable view over a table: rows are produced by running the request,
# package, file and transform stages built from the options hash.
class RemoteTable
  # Paths queued for deletion by RemoteTable.cleanup (nil until the first
  # call to remove_at_exit).
  cattr_accessor :paths_for_removal

  class << self
    # Deletes every queued path and empties the queue.
    #
    # The queue is drained from the front rather than deleted from inside
    # #each: removing elements from an Array while iterating it makes the
    # iteration skip the element after each removal, which left every other
    # path on disk.
    def cleanup
      return unless paths_for_removal.is_a?(Array)
      until paths_for_removal.empty?
        FileUtils.rm_rf paths_for_removal.first
        paths_for_removal.shift
      end
    end

    # Queues +path+ for deletion by the next call to cleanup.
    def remove_at_exit(path)
      self.paths_for_removal ||= []
      paths_for_removal.push path
    end
  end

  attr_accessor :request, :package, :file, :transform
  attr_accessor :table

  include Enumerable

  # bus:: options hash handed unchanged to Transform, Package, Request and File.
  def initialize(bus)
    @transform = Transform.new(bus)
    @package = Package.new(bus)
    @request = Request.new(bus)
    @file = File.new(bus)
    at_exit { RemoteTable.cleanup }
  end

  # Yields each row, building the table on first use.
  def each
    finish_table! unless table
    table.each_row { |row| yield row }
  end
  alias :each_row :each

  # Returns all rows as an Array; the result is memoized after the first call.
  def to_a
    cache_rows! if @_row_cache.nil?
    @_row_cache
  end
  alias :rows :to_a

  # Tables cannot be ordered; always raises.
  def <=>(other)
    raise "Not implemented"
  end

  # NOTE: `protected` does not apply to the `def self.` methods below; they
  # remain public class methods, and RemoteTable::File calls RemoteTable.bang.
  protected

  # Rewrites the file at +path+ in place by piping it through the shell
  # command +cmd+. +cmd+ is inserted into the pipeline verbatim, so callers
  # must escape any untrusted parts themselves.
  def self.bang(path, cmd)
    tmp_path = "#{path}.tmp"
    RemoteTable.backtick_with_reporting "cat #{Escape.shell_single_word path} | #{cmd} > #{Escape.shell_single_word tmp_path}"
    FileUtils.mv tmp_path, path
  end

  # TODO this should probably live somewhere else
  #
  # Runs +cmd+ in a subshell after collapsing embedded newlines (and the
  # spaces around them) into single spaces. Returns the command's standard
  # output; raises a RuntimeError carrying the command and its output when
  # the exit status is non-zero.
  def self.backtick_with_reporting(cmd)
    cmd = cmd.gsub(/[ ]*\n[ ]*/m, ' ')
    output = `#{cmd}`
    unless $?.success?
      raise [
        "",
        "From the remote_table gem...",
        "",
        "Command failed:",
        cmd,
        "",
        "Output:",
        output,
        ""
      ].join("\n")
    end
    output
  end

  private

  # Runs the download, stage, tabulate and transform steps and stores the
  # result in #table.
  def finish_table!
    package_path = request.download
    file_path = package.stage(package_path)
    raw_table = file.tabulate(file_path)
    self.table = transform.apply(raw_table) # must return something that responds to each_row
  end

  # Fills the row cache by iterating the whole table once.
  def cache_rows!
    @_row_cache = []
    each_row { |row| @_row_cache << row }
  end
end
|
@@ -0,0 +1,100 @@
|
|
1
|
+
class RemoteTable
  # Parsing options for one staged file, plus the shell-based preprocessing
  # steps (backup, re-encoding, skipping, cropping, cutting) shared by the
  # format modules.
  class File
    attr_accessor :filename, :format, :delimiter, :skip, :cut, :crop, :sheet, :headers, :schema, :schema_name, :trap
    attr_accessor :encoding
    attr_accessor :path
    attr_accessor :keep_blank_rows
    attr_accessor :row_xpath
    attr_accessor :column_xpath

    # bus:: options hash. Reads :filename, :format, :delimiter, :sheet, :skip,
    #       :keep_blank_rows, :crop, :cut, :headers, :schema, :schema_name,
    #       :trap, :encoding, :row_xpath and :column_xpath.
    #
    # When :format is absent it is guessed from the filename. The instance is
    # then extended with the matching format module (e.g. RemoteTable::Csv
    # for :csv).
    def initialize(bus)
      @filename = bus[:filename]
      @format = bus[:format] || format_from_filename
      @delimiter = bus[:delimiter]
      @sheet = bus[:sheet] || 0
      @skip = bus[:skip] # rows
      @keep_blank_rows = bus[:keep_blank_rows] || false
      @crop = bus[:crop] # rows
      @cut = bus[:cut] # columns
      @headers = bus[:headers]
      @schema = bus[:schema]
      @schema_name = bus[:schema_name]
      @trap = bus[:trap]
      @encoding = bus[:encoding] || 'UTF-8'
      @row_xpath = bus[:row_xpath]
      @column_xpath = bus[:column_xpath]
      extend "RemoteTable::#{format.to_s.camelcase}".constantize
    end

    # Records the local +path+ to read from and returns self.
    def tabulate(path)
      define_fixed_width_schema! if format == :fixed_width && schema.is_a?(Array) # TODO move to generic subclass callback
      self.path = path
      self
    end

    private

    # Registers a Slither definition built from the inline +schema+ array of
    # [name, width, options] entries; an entry named 'spacer' becomes a spacer.
    # Raises if +schema_name+ was also supplied.
    #
    # doesn't support trap
    def define_fixed_width_schema!
      raise "can't define both schema_name and schema" if !schema_name.blank?
      # to_s keeps a missing filename from raising NoMethodError here.
      self.schema_name = "autogenerated_#{filename.to_s.gsub(/[^a-z0-9_]/i, '')}".to_sym
      # The default trap accepts any arguments: a zero-parameter lambda raises
      # ArgumentError on Ruby 1.9+ as soon as it is called with a line.
      self.trap ||= lambda { |*_args| true }
      Slither.define schema_name do |d|
        d.rows do |row|
          row.trap(&trap)
          schema.each do |name, width, options|
            if name == 'spacer'
              row.spacer width
            else
              row.column name, width, options
            end
          end
        end
      end
    end

    # Copies the working file to "<path>.backup" so restore_file! can undo
    # the in-place edits made by the other steps.
    def backup_file!
      FileUtils.cp path, "#{path}.backup"
    end

    # Drops the first +skip+ lines of the working file; no-op when skip is unset.
    def skip_rows!
      return unless skip
      RemoteTable.bang path, "tail -n +#{skip + 1}"
    end

    # Byte sequences stripped from every file. Single-quoted on purpose: the
    # backslash escapes are passed through literally so that perl, not Ruby,
    # interprets them.
    USELESS_CHARACTERS = [
      '\xef\xbb\xbf', # UTF-8 byte order mark
      '\xc2\xad' # soft hyphen, often inserted by MS Office (html: &shy;)
    ].freeze

    # Removes every USELESS_CHARACTERS sequence from the working file.
    def remove_useless_characters!
      RemoteTable.bang path, "perl -pe 's/#{USELESS_CHARACTERS.join '//g; s/'}//g'"
    end

    # Re-encodes the working file from +encoding+ to UTF-8 (iconv -c).
    def convert_file_to_utf8!
      RemoteTable.bang path, "iconv -c -f #{Escape.shell_single_word encoding} -t UTF-8"
    end

    # Moves the backup made by backup_file! back over the working file, if
    # a readable backup exists.
    def restore_file!
      FileUtils.mv "#{path}.backup", path if ::File.readable? "#{path}.backup"
    end

    # Keeps only the character columns named by +cut+ (passed to `cut -c`);
    # no-op when cut is unset.
    def cut_columns!
      return unless cut
      RemoteTable.bang path, "cut -c #{Escape.shell_single_word cut.to_s}"
    end

    # Keeps only lines crop.first through crop.last of the working file;
    # no-op when crop is unset. Assumes +crop+ responds to first/last with
    # 1-based line numbers (e.g. a Range) -- TODO confirm against callers.
    def crop_rows!
      return unless crop
      RemoteTable.bang path, "tail -n +#{Escape.shell_single_word crop.first.to_s} | head -n #{crop.last - crop.first + 1}"
    end

    # Guesses the format from the filename's extension: :xls, :ods and :xlsx
    # map to themselves, anything starting with "htm" is :html, and everything
    # else (including a missing filename or extension) is :csv. Matching is
    # case-insensitive.
    def format_from_filename
      extname = ::File.extname(filename.to_s).delete('.').downcase
      return :html if extname =~ /\Ahtm/
      %w[xls ods xlsx].include?(extname) ? extname.to_sym : :csv
    end
  end
end
|
@@ -0,0 +1,57 @@
|
|
1
|
+
# Select the CSV parser by interpreter version and expose it as
# RemoteTable::MyCSV: the 'fastercsv' library before Ruby 1.9, the 'csv'
# library from 1.9 onwards.
if RUBY_VERSION < "1.9"
  require 'fastercsv'
  ::RemoteTable::MyCSV = ::FasterCSV
else
  require 'csv'
  ::RemoteTable::MyCSV = ::CSV
end
|
8
|
+
|
9
|
+
class RemoteTable
  # Row iteration for delimited text files. Relies on the host object for
  # path, headers, delimiter, keep_blank_rows and the preprocessing helpers.
  module Csv
    # Preprocesses the file in place, then yields one ordered hash per row:
    # keyed by header when the parser returns header rows, by column index
    # when it returns plain arrays. nil cells become ''. Rows without any
    # present value are skipped unless keep_blank_rows is set. The original
    # file is restored afterwards, even on error.
    def each_row(&block)
      backup_file!
      convert_file_to_utf8!
      remove_useless_characters!
      skip_rows!
      ::RemoteTable::MyCSV.foreach(path, fastercsv_options) do |record|
        cells = ActiveSupport::OrderedHash.new
        has_content = false
        case record
        when ::RemoteTable::MyCSV::Row
          record.each do |header, value|
            next if header.blank?
            value = '' if value.nil?
            cells[header] = value
            has_content = true if value.present?
          end
        when Array
          record.each_with_index do |value, index|
            value = '' if value.nil?
            cells[index] = value
            has_content = true if value.present?
          end
        else
          raise "Unexpected #{record.inspect}"
        end
        yield cells if keep_blank_rows || has_content
      end
    ensure
      restore_file!
    end

    private

    # Options hash for the CSV parser: blank lines are skipped unless
    # keep_blank_rows is set, the first row supplies headers unless
    # headers == false, and :col_sep is set only when a delimiter was given.
    def fastercsv_options
      options = {
        :skip_blanks => !keep_blank_rows,
        :headers => (headers == false ? nil : :first_row)
      }
      options[:col_sep] = delimiter if delimiter
      options
    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
class RemoteTable
  # Row iteration for fixed-width text files, parsed with Slither using the
  # host object's schema_name.
  module FixedWidth
    # Preprocesses the file in place (re-encode, strip, crop, skip, cut),
    # parses it, and yields each row hash with blank keys removed. Rows
    # without any present value are skipped unless keep_blank_rows is set.
    # The original file is restored afterwards, even on error.
    def each_row(&block)
      backup_file!
      convert_file_to_utf8!
      remove_useless_characters!
      crop_rows!
      skip_rows!
      cut_columns!
      parsed = Slither.parse(path, schema_name)
      parsed[:rows].each do |record|
        record.delete_if { |key, _value| key.blank? }
        next unless keep_blank_rows || record.any? { |_key, value| value.present? }
        yield record
      end
    ensure
      restore_file!
    end
  end
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
require 'cgi' # CGI.unescapeHTML is used below

class RemoteTable
  # Row iteration for HTML tables. Relies on the host object for path,
  # headers, row_xpath, column_xpath, keep_blank_rows and the preprocessing
  # helpers.
  module Html
    # Preprocesses the file in place, then yields one hash per element matched
    # by row_xpath, keyed by header. Headers come from +headers+ when it is an
    # Array; otherwise the first matched row is consumed as the header row.
    # Cell text has runs of whitespace collapsed and is stripped. Rows without
    # any present value are skipped unless keep_blank_rows is set. The
    # original file is restored afterwards, even on error.
    def each_row(&block)
      backup_file!
      convert_file_to_utf8!
      remove_useless_characters!
      html_headers = headers.is_a?(Array) ? headers : nil
      Nokogiri::HTML(unescaped_html_without_soft_hyphens, nil, 'UTF-8').xpath(row_xpath).each do |row|
        values = row.xpath(column_xpath).map { |td| td.content.gsub(/\s+/, ' ').strip }
        if html_headers.nil?
          html_headers = values
          next
        end
        hash = zip html_headers, values
        yield hash if keep_blank_rows || hash.any? { |_key, value| value.present? }
      end
    ensure
      restore_file!
    end

    private

    # Pairs each key with the value at the same position; keys without a
    # matching value map to nil and surplus values are dropped.
    #
    # http://snippets.dzone.com/posts/show/406
    def zip(keys, values)
      Hash[keys.zip(values)]
    end

    # Reads the working file and returns its contents with the entities that
    # CGI.unescapeHTML handles decoded and any remaining "&shy;" (soft hyphen)
    # entity removed. The entity is matched as plain ASCII text, so this file
    # needs no non-ASCII literal.
    #
    # should we be doing this in ruby?
    def unescaped_html_without_soft_hyphens
      CGI.unescapeHTML(IO.read(path)).gsub('&shy;', '')
    end
  end
end
|