remote_table 0.2.32 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +5 -0
- data/Gemfile +4 -0
- data/Gemfile.lock +65 -0
- data/LICENSE +1 -1
- data/README.rdoc +21 -7
- data/Rakefile +12 -61
- data/lib/remote_table/cleaner.rb +19 -0
- data/lib/remote_table/executor.rb +29 -0
- data/lib/remote_table/format/delimited.rb +62 -0
- data/lib/remote_table/format/excel.rb +10 -0
- data/lib/remote_table/format/excelx.rb +10 -0
- data/lib/remote_table/format/fixed_width.rb +47 -0
- data/lib/remote_table/format/html.rb +43 -0
- data/lib/remote_table/format/mixins/rooable.rb +47 -0
- data/lib/remote_table/format/mixins/textual.rb +34 -0
- data/lib/remote_table/format/open_office.rb +10 -0
- data/lib/remote_table/format.rb +35 -0
- data/lib/remote_table/hasher.rb +25 -0
- data/lib/remote_table/local_file.rb +92 -0
- data/lib/remote_table/properties.rb +209 -0
- data/lib/remote_table/transformer.rb +17 -0
- data/lib/remote_table/version.rb +3 -0
- data/lib/remote_table.rb +91 -99
- data/remote_table.gemspec +32 -77
- data/test/{test_helper.rb → helper.rb} +9 -2
- data/test/test_big.rb +61 -0
- data/test/test_errata.rb +46 -0
- data/test/test_old_syntax.rb +229 -0
- data/test/test_old_transform.rb +49 -0
- data/test/test_remote_table.rb +13 -0
- metadata +176 -53
- data/VERSION +0 -1
- data/lib/remote_table/file/csv.rb +0 -49
- data/lib/remote_table/file/fixed_width.rb +0 -19
- data/lib/remote_table/file/html.rb +0 -37
- data/lib/remote_table/file/ods.rb +0 -11
- data/lib/remote_table/file/roo_spreadsheet.rb +0 -44
- data/lib/remote_table/file/xls.rb +0 -11
- data/lib/remote_table/file/xlsx.rb +0 -11
- data/lib/remote_table/file.rb +0 -100
- data/lib/remote_table/package.rb +0 -89
- data/lib/remote_table/request.rb +0 -44
- data/lib/remote_table/transform.rb +0 -58
- data/test/remote_table_test.rb +0 -386
data/CHANGELOG
CHANGED
@@ -1,3 +1,8 @@
|
|
1
|
+
1.0.0
|
2
|
+
* Refactored to follow more Ruby conventions
|
3
|
+
* Suggesting new syntax that looks more like an Enumerable... t[5] instead of t.rows[5]
|
4
|
+
* Switching to string option keys (but old syntax is supported)
|
5
|
+
[...no changelog for 0.1.6--1.0.0...sorry]
|
1
6
|
0.1.6
|
2
7
|
* For CSVs, force convert headers using String#toutf8. :encoding => 'N'|'U' didn't work.
|
3
8
|
* Fix handling of long urls when passing off to Tempfile.
|
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,65 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
remote_table (1.0.0)
|
5
|
+
activesupport (>= 2.3.4)
|
6
|
+
builder
|
7
|
+
escape (>= 0.0.4)
|
8
|
+
google-spreadsheet-ruby
|
9
|
+
i18n
|
10
|
+
nokogiri (>= 1.4.1)
|
11
|
+
roo (~> 1.9)
|
12
|
+
slither (>= 0.99.4)
|
13
|
+
spreadsheet
|
14
|
+
zip
|
15
|
+
|
16
|
+
GEM
|
17
|
+
remote: http://rubygems.org/
|
18
|
+
specs:
|
19
|
+
activesupport (3.0.3)
|
20
|
+
builder (3.0.0)
|
21
|
+
columnize (0.3.2)
|
22
|
+
errata (0.2.4)
|
23
|
+
activesupport (>= 2.3.4)
|
24
|
+
remote_table (>= 0.2.31)
|
25
|
+
escape (0.0.4)
|
26
|
+
google-spreadsheet-ruby (0.1.2)
|
27
|
+
nokogiri (>= 1.4.3.1)
|
28
|
+
oauth (>= 0.3.6)
|
29
|
+
i18n (0.5.0)
|
30
|
+
linecache (0.43)
|
31
|
+
nokogiri (1.4.3.1)
|
32
|
+
oauth (0.4.4)
|
33
|
+
roo (1.9.3)
|
34
|
+
ruby-debug (0.10.4)
|
35
|
+
columnize (>= 0.1)
|
36
|
+
ruby-debug-base (~> 0.10.4.0)
|
37
|
+
ruby-debug-base (0.10.4)
|
38
|
+
linecache (>= 0.3)
|
39
|
+
ruby-ole (1.2.10.1)
|
40
|
+
shoulda (2.10.3)
|
41
|
+
slither (0.99.4)
|
42
|
+
spreadsheet (0.6.4.1)
|
43
|
+
ruby-ole
|
44
|
+
test-unit (2.1.2)
|
45
|
+
zip (2.0.2)
|
46
|
+
|
47
|
+
PLATFORMS
|
48
|
+
ruby
|
49
|
+
|
50
|
+
DEPENDENCIES
|
51
|
+
activesupport (>= 2.3.4)
|
52
|
+
builder
|
53
|
+
errata (>= 0.2.0)
|
54
|
+
escape (>= 0.0.4)
|
55
|
+
google-spreadsheet-ruby
|
56
|
+
i18n
|
57
|
+
nokogiri (>= 1.4.1)
|
58
|
+
remote_table!
|
59
|
+
roo (~> 1.9)
|
60
|
+
ruby-debug
|
61
|
+
shoulda
|
62
|
+
slither (>= 0.99.4)
|
63
|
+
spreadsheet
|
64
|
+
test-unit
|
65
|
+
zip
|
data/LICENSE
CHANGED
data/README.rdoc
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
=remote_table
|
2
2
|
|
3
|
-
|
3
|
+
Open local or remote XLSX, XLS, ODS, CSV and fixed-width files.
|
4
4
|
|
5
5
|
==Real-life usage
|
6
6
|
|
@@ -8,15 +8,29 @@ Used by data_miner (http://github.com/seamusabshere/data_miner)
|
|
8
8
|
|
9
9
|
==Example
|
10
10
|
|
11
|
-
Taken from <tt>#{GEMDIR}/test/
|
11
|
+
Taken from <tt>#{GEMDIR}/test/test_remote_table.rb</tt>:
|
12
12
|
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
13
|
+
should "open an XLSX" do
|
14
|
+
t = RemoteTable.new 'www.customerreferenceprogram.org/uploads/CRP_RFP_template.xlsx'
|
15
|
+
assert_equal "Secure encryption of all data", t[5]["Requirements"]
|
16
|
+
end
|
17
|
+
|
18
|
+
or on the console
|
19
|
+
|
20
|
+
?> t = RemoteTable.new 'http://www.fueleconomy.gov/FEG/epadata/98guide6.zip', 'filename' => '98guide6.csv'
|
21
|
+
=> #<RemoteTable:0x359da50 [...]>
|
22
|
+
?> t[0]
|
23
|
+
=> {"cyl"=>"6", "eng dscr"=>"DOHC VTEC", "trans dscr"=>"2MODE CLKUP", "trans"=>"Auto(L4)", "cmb"=>"20", "2pv"=>nil, "carline name"=>"NSX", "displ"=>"3.0", "ucmb"=>"23.5311", "hpv"=>nil, "4pv"=>nil, "Class"=>"TWO SEATERS", "Manufacturer"=>"ACURA", "fl"=>"P", "2lv"=>nil, "G"=>nil, "hlv"=>nil, "drv"=>"R", "cty"=>"18", "ucty"=>"19.8733", "S"=>nil, "4lv"=>nil, "fcost"=>"1050", "T"=>nil, "hwy"=>"24", "uhwy"=>"30.3612"}
|
17
24
|
|
18
25
|
See the test file and also data_miner examples of custom parsers.
|
19
26
|
|
27
|
+
==Wishlist
|
28
|
+
|
29
|
+
* The new parser syntax (aka transformer) hasn't been defined yet... only the old-style syntax is available
|
30
|
+
* We currently call curl (and a lot of other utilities) using a shell. Is there a safer way to do this?
|
31
|
+
* Row hashes may come out differently for Ruby 1.8 and Ruby 1.9, which ruins the whole purpose.
|
32
|
+
* Since <tt>Enumerable</tt> provides <tt>#to_a</tt>, I'm not sure if it's caching the row loading.
|
33
|
+
|
20
34
|
==Authors
|
21
35
|
|
22
36
|
* Seamus Abshere <seamus@abshere.net>
|
@@ -24,4 +38,4 @@ See the test file and also data_miner examples of custom parsers.
|
|
24
38
|
|
25
39
|
== Copyright
|
26
40
|
|
27
|
-
Copyright (c)
|
41
|
+
Copyright (c) 2011 Brighter Planet. See LICENSE for details.
|
data/Rakefile
CHANGED
@@ -1,72 +1,23 @@
|
|
1
|
-
require '
|
2
|
-
|
3
|
-
|
4
|
-
begin
|
5
|
-
require 'jeweler'
|
6
|
-
Jeweler::Tasks.new do |gem|
|
7
|
-
gem.name = "remote_table"
|
8
|
-
gem.summary = %Q{Remotely open and parse XLS, ODS, CSV and fixed-width tables.}
|
9
|
-
gem.description = %Q{Remotely open and parse Excel XLS, ODS, CSV and fixed-width tables.}
|
10
|
-
gem.email = "seamus@abshere.net"
|
11
|
-
gem.homepage = "http://github.com/seamusabshere/remote_table"
|
12
|
-
gem.authors = ["Seamus Abshere", "Andy Rossmeissl"]
|
13
|
-
# sabshere [unknown date] roo 1.9.3 doesn't work, so use old 1.3 version
|
14
|
-
gem.add_dependency 'roo', '1.3.11'
|
15
|
-
# sabshere 9/30/10 depending on fastercsv when using ruby 1.9.2 results in exiting with error
|
16
|
-
# gem.add_dependency 'fastercsv', '>=1.5.0'
|
17
|
-
gem.add_dependency 'activesupport', '>=2.3.4'
|
18
|
-
gem.add_dependency 'slither', '>=0.99.4'
|
19
|
-
gem.add_dependency 'nokogiri', '>=1.4.1'
|
20
|
-
gem.add_dependency 'escape', '>=0.0.4'
|
21
|
-
gem.add_development_dependency 'errata', '>=0.2.0'
|
22
|
-
gem.require_path = "lib"
|
23
|
-
gem.rdoc_options << '--line-numbers' << '--inline-source'
|
24
|
-
gem.requirements << 'curl'
|
25
|
-
gem.rubyforge_project = "remotetable"
|
26
|
-
end
|
27
|
-
Jeweler::GemcutterTasks.new
|
28
|
-
# Jeweler::RubyforgeTasks.new do |rubyforge|
|
29
|
-
# rubyforge.doc_task = "rdoc"
|
30
|
-
# end
|
31
|
-
rescue LoadError
|
32
|
-
puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
|
33
|
-
end
|
1
|
+
require 'bundler'
|
2
|
+
Bundler::GemHelper.install_tasks
|
34
3
|
|
4
|
+
require 'rake'
|
35
5
|
require 'rake/testtask'
|
36
6
|
Rake::TestTask.new(:test) do |test|
|
37
7
|
test.libs << 'lib' << 'test'
|
38
|
-
test.pattern = 'test
|
8
|
+
test.pattern = 'test/**/test_*.rb'
|
39
9
|
test.verbose = true
|
40
10
|
end
|
41
11
|
|
42
12
|
begin
|
43
|
-
require '
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
13
|
+
require 'rake/rdoctask'
|
14
|
+
# Builds the API documentation into ./rdoc from the README and everything under lib/.
Rake::RDocTask.new do |rdoc|
  rdoc.rdoc_dir = 'rdoc'
  # Title shown in the generated documentation; it has to name this gem.
  rdoc.title = 'remote_table'
  rdoc.options << '--line-numbers' << '--inline-source'
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
|
49
21
|
rescue LoadError
|
50
|
-
|
51
|
-
abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
|
52
|
-
end
|
53
|
-
end
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
task :default => :test
|
59
|
-
|
60
|
-
require 'rake/rdoctask'
|
61
|
-
Rake::RDocTask.new do |rdoc|
|
62
|
-
if File.exist?('VERSION')
|
63
|
-
version = File.read('VERSION')
|
64
|
-
else
|
65
|
-
version = ""
|
66
|
-
end
|
67
|
-
|
68
|
-
rdoc.rdoc_dir = 'rdoc'
|
69
|
-
rdoc.title = "remote_table #{version}"
|
70
|
-
rdoc.rdoc_files.include('README*')
|
71
|
-
rdoc.rdoc_files.include('lib/**/*.rb')
|
22
|
+
puts "Rdoc is not available"
|
72
23
|
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
require 'singleton'
|
2
|
+
require 'fileutils'
|
3
|
+
class RemoteTable
  # Process-wide (singleton) registry of filesystem paths that should be
  # deleted later. Callers register paths with #remove_at_exit and something
  # outside this class is expected to call #cleanup.
  class Cleaner
    include ::Singleton

    # @return [Array] paths that have been registered and not yet removed
    def paths_for_removal
      @paths_for_removal ||= []
    end

    # Recursively deletes every registered path and forgets it.
    #
    # The list is drained with #shift instead of calling Array#delete inside
    # Array#each: deleting from an array while iterating over it shifts the
    # remaining elements down, so the iteration skips every other entry and
    # leaves those paths both on disk and in the list.
    #
    # @return [Array] the (now empty) list of registered paths
    def cleanup
      until paths_for_removal.empty?
        ::FileUtils.rm_rf paths_for_removal.shift
      end
      paths_for_removal
    end

    # Registers +path+ for deletion by the next #cleanup.
    #
    # @param path [String] file or directory to delete
    # @return [Array] the updated list of registered paths
    def remove_at_exit(path)
      paths_for_removal << path
    end
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
require 'singleton'
|
2
|
+
require 'escape'
|
3
|
+
require 'fileutils'
|
4
|
+
class RemoteTable
  # Singleton that runs shell commands for the gem and raises when one fails.
  class Executor
    include ::Singleton

    # Rewrites the file at +path+ in place by piping it through +cmd+.
    #
    # The output is written to a sibling temp file which then replaces the
    # original. +path+ and the temp path are shell-escaped; +cmd+ is
    # interpolated verbatim, so callers must escape anything variable in it.
    #
    # @param path [String] file to rewrite
    # @param cmd [String] shell pipeline fragment reading stdin and writing stdout
    # @raise [RuntimeError] if the pipeline exits unsuccessfully
    def bang(path, cmd)
      tmp_path = "#{path}.bang.#{rand}"
      backtick_with_reporting "/bin/cat #{::Escape.shell_single_word path} | #{cmd} > #{::Escape.shell_single_word tmp_path}"
      ::FileUtils.mv tmp_path, path
    ensure
      # The shell creates the temp file before running the pipeline, so a
      # failed command would otherwise leave it behind next to the original.
      # After a successful mv this is a no-op.
      ::FileUtils.rm_f tmp_path if tmp_path
    end

    # Runs +cmd+ in a shell, raising with the command and its output on failure.
    #
    # @param cmd [String] command line; embedded newlines are collapsed to spaces
    # @return [nil]
    # @raise [RuntimeError] if the command exits with a non-zero status
    def backtick_with_reporting(cmd)
      # Parenthesised so the regex literal is not parsed ambiguously as a division.
      cmd = cmd.gsub(/[ ]*\n[ ]*/m, ' ')
      output = `#{cmd}`
      unless $?.success?
        raise %{
From the remote_table gem...

Command failed:
#{cmd}

Output:
#{output}
}
      end
    end
  end
end
|
@@ -0,0 +1,62 @@
|
|
1
|
+
# Ruby 1.9+ ships FasterCSV as the stdlib CSV; alias it so the rest of the
# file can refer to ::FasterCSV on every Ruby version.
if RUBY_VERSION >= '1.9'
  require 'csv'
  ::FasterCSV = ::CSV
else
  begin
    require 'fastercsv'
  rescue ::LoadError
    $stderr.puts "[remote_table gem] You probably need to manually install the fastercsv gem and/or require it in your Gemfile."
    raise
  end
end

class RemoteTable
  class Format
    # Reads delimiter-separated text (CSV and friends) through FasterCSV.
    class Delimited < Format
      include Textual

      # Yields one ordered hash per row.
      #
      # The local file is backed up, normalised in place (UTF-8 conversion,
      # useless characters stripped, leading rows skipped), parsed, and then
      # restored from the backup even if parsing raises.
      #
      # With headers enabled each row arrives as a FasterCSV::Row and is keyed
      # by header (blank headers are dropped); without headers it arrives as
      # an Array and is keyed by zero-based column index. nil cells become ''.
      # Rows with no present value are skipped unless keep_blank_rows is set.
      def each(&blk)
        backup_file!
        convert_file_to_utf8!
        remove_useless_characters!
        skip_rows!
        ::FasterCSV.foreach(t.local_file.path, fastercsv_options) do |row|
          record = ::ActiveSupport::OrderedHash.new
          has_content = false
          case row
          when ::FasterCSV::Row
            row.each do |header, cell|
              next if header.blank?
              cell = '' if cell.nil?
              record[header] = cell
              has_content = true if cell.present?
            end
          when ::Array
            row.each_with_index do |cell, position|
              cell = '' if cell.nil?
              record[position] = cell
              has_content = true if cell.present?
            end
          end
          next unless t.properties.keep_blank_rows || has_content
          yield record
        end
      ensure
        restore_file!
      end

      private

      # Translates the table's properties into FasterCSV.foreach options.
      def fastercsv_options
        options = { :skip_blanks => !t.properties.keep_blank_rows }
        options[:headers] = (t.properties.headers == false) ? nil : :first_row
        options[:col_sep] = t.properties.delimiter if t.properties.delimiter
        options
      end
    end
  end
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
require 'slither'
|
2
|
+
class RemoteTable
  class Format
    # Reads fixed-width text files through the slither gem.
    class FixedWidth < Format
      include Textual

      # Yields one hash per parsed row.
      #
      # The local file is backed up, normalised in place (UTF-8 conversion,
      # useless characters stripped, rows cropped and skipped, columns cut),
      # parsed, and then restored from the backup even if parsing raises.
      # Entries with blank keys are dropped; rows with no present value are
      # skipped unless keep_blank_rows is set.
      def each(&blk)
        backup_file!
        convert_file_to_utf8!
        remove_useless_characters!
        crop_rows!
        skip_rows!
        cut_columns!
        parser.parse[:rows].each do |record|
          record.reject! { |name, _| name.blank? }
          next unless t.properties.keep_blank_rows || record.any? { |_, cell| cell.present? }
          yield record
        end
      ensure
        restore_file!
      end

      private

      # Memoized Slither parser bound to the local file.
      def parser
        @parser ||= ::Slither::Parser.new definition, t.local_file.path
      end

      # Memoized Slither definition; see #build_definition.
      def definition
        @definition ||= build_definition
      end

      # Looks up a definition already registered with Slither under
      # schema_name (String or Symbol), or else defines a new one from the
      # schema Array of [name, width, options] triples, where the name
      # 'spacer' marks a gap rather than a column.
      def build_definition
        case t.properties.schema_name
        when ::String, ::Symbol
          return ::Slither.send(:definition, t.properties.schema_name)
        end
        unless t.properties.schema.is_a?(::Array)
          raise "expecting schema_name to be a String or Symbol, or schema to be an Array"
        end
        # Every line belongs to the single "rows" section.
        accept_all = lambda { |_| true }
        # The definition is registered under a random name.
        ::Slither.define(rand.to_s) do |d|
          d.rows do |row|
            row.trap(&accept_all)
            t.properties.schema.each do |name, width, options|
              if name == 'spacer'
                row.spacer width
              else
                row.column name, width, options
              end
            end
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'cgi'
|
3
|
+
class RemoteTable
  class Format
    # Reads rows out of an HTML document using the row and column XPath
    # expressions configured on the table.
    class HTML < Format
      include Textual

      # Yields one hash per matched row.
      #
      # The local file is backed up, normalised in place, parsed with
      # Nokogiri, and then restored from the backup even if parsing raises.
      # If headers were not supplied as an Array, the first matched row is
      # consumed as the header row. Whitespace runs inside a cell are
      # collapsed to a single space. Rows with no present value are skipped
      # unless keep_blank_rows is set.
      def each(&blk)
        backup_file!
        convert_file_to_utf8!
        remove_useless_characters!
        html_headers = (t.properties.headers.is_a?(::Array)) ? t.properties.headers : nil
        ::Nokogiri::HTML(unescaped_html_without_soft_hyphens, nil, 'UTF-8').xpath(t.properties.row_xpath).each do |row|
          values = row.xpath(t.properties.column_xpath).map { |td| td.content.gsub(/\s+/, ' ').strip }
          if html_headers.nil?
            html_headers = values
            next
          end
          hash = zip html_headers, values
          yield hash if t.properties.keep_blank_rows or hash.any? { |k, v| v.present? }
        end
      ensure
        restore_file!
      end

      private

      # Pairs keys with values positionally into a Hash.
      # http://snippets.dzone.com/posts/show/406
      def zip(keys, values)
        hash = ::Hash.new
        keys.zip(values) { |k,v| hash[k]=v }
        hash
      end

      # Returns the file contents with HTML entities unescaped and soft
      # hyphens removed.
      #
      # should we be doing this in ruby?
      def unescaped_html_without_soft_hyphens
        str = ::CGI.unescapeHTML ::IO.read(t.local_file.path)
        # get rid of MS Office baddies: CGI.unescapeHTML leaves the named
        # entity &shy; untouched, so strip the entity text itself. It is
        # spelled out in ASCII here instead of embedding an invisible U+00AD
        # character in the source.
        str.gsub! '&shy;', ''
        str
      end
    end
  end
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
require 'roo'
|
2
|
+
class RemoteTable
  class Format
    # Mixin for spreadsheet formats that are opened through the roo gem.
    # The including class must provide #roo_class and #t.
    module Rooable
      # Yields one ordered hash per data row of the selected sheet.
      #
      # A numeric sheet property is treated as an index into the workbook's
      # sheets; anything else is passed to roo as is. Cell values are
      # stringified, stripped of anything that looks like a tag, and trimmed.
      # Columns without a usable key are dropped; rows with no present value
      # are skipped unless keep_blank_rows is set.
      def each(&blk)
        spreadsheet = roo_class.new t.local_file.path, nil, :ignore
        spreadsheet.default_sheet = if t.properties.sheet.is_a?(::Numeric)
          spreadsheet.sheets[t.properties.sheet]
        else
          t.properties.sheet
        end
        column_references = column_references_for spreadsheet
        first_data_row.upto(spreadsheet.last_row) do |raw_row|
          ordered_hash = ::ActiveSupport::OrderedHash.new
          (1..spreadsheet.last_column).each do |col|
            key = column_references[col]
            next if key.blank?
            ordered_hash[key] = spreadsheet.cell(raw_row, col).to_s.gsub(/<[^>]+>/, '').strip
          end
          next unless t.properties.keep_blank_rows || ordered_hash.any? { |_, v| v.present? }
          yield ordered_hash
        end
      end

      private

      # Maps each 1-based column number to the key used in row hashes:
      # * headers == false     -> zero-based numeric keys
      # * headers is an Array  -> the supplied names, by position
      # * otherwise            -> the header read from the file itself,
      #                           looking one row up when the header cell is blank
      def column_references_for(spreadsheet)
        headers = t.properties.headers
        references = ::Hash.new
        (1..spreadsheet.last_column).each do |col|
          references[col] = if headers == false
            col - 1
          elsif headers.is_a?(::Array)
            headers[col - 1]
          else
            from_file = spreadsheet.cell(header_row, col)
            from_file = spreadsheet.cell(header_row - 1, col) if from_file.blank? # look up
            from_file
          end
        end
        references
      end

      # 1-based row number of the header row, after the skipped rows.
      def header_row
        1 + t.properties.skip
      end

      # 1-based row number of the first data row, directly below the header row.
      def first_data_row
        1 + header_row
      end
    end
  end
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
require 'fileutils'
|
2
|
+
require 'escape'
|
3
|
+
class RemoteTable
  class Format
    # Mixin for text-based formats. Every method rewrites the table's local
    # file in place by piping it through a shell utility via the executor.
    module Textual
      # Byte sequences stripped by #remove_useless_characters!. They are
      # single-quoted on purpose: the backslash escapes must reach perl
      # unexpanded so that perl, not Ruby, interprets them.
      USELESS_CHARACTERS = [
        '\xef\xbb\xbf',   # UTF-8 byte order mark
        '\xc2\xad'        # soft hyphen, often inserted by MS Office (html: &shy;)
      ]

      # Re-encodes the file from the configured encoding to UTF-8 with iconv,
      # discarding characters that cannot be converted (-c).
      def convert_file_to_utf8!
        source_encoding = ::Escape.shell_single_word t.properties.encoding
        ::RemoteTable.executor.bang t.local_file.path, "iconv -c -f #{source_encoding} -t UTF-8"
      end

      # Deletes every occurrence of each USELESS_CHARACTERS sequence with perl.
      def remove_useless_characters!
        substitutions = USELESS_CHARACTERS.join '//g; s/'
        ::RemoteTable.executor.bang t.local_file.path, "perl -pe 's/#{substitutions}//g'"
      end

      # Drops the first +skip+ lines of the file; no-op when skip is not positive.
      def skip_rows!
        rows_to_skip = t.properties.skip
        return unless rows_to_skip > 0
        ::RemoteTable.executor.bang t.local_file.path, "tail -n +#{rows_to_skip + 1}"
      end

      # Keeps only the lines from crop.first through crop.last (1-based,
      # inclusive); no-op when crop is not set.
      def crop_rows!
        return unless t.properties.crop
        first_line = t.properties.crop.first
        line_count = t.properties.crop.last - first_line + 1
        ::RemoteTable.executor.bang t.local_file.path, "tail -n +#{::Escape.shell_single_word first_line.to_s} | head -n #{line_count}"
      end

      # Keeps only the character positions named by +cut+ (passed to
      # `cut -c`); no-op when cut is not set.
      def cut_columns!
        return unless t.properties.cut
        character_list = ::Escape.shell_single_word t.properties.cut.to_s
        ::RemoteTable.executor.bang t.local_file.path, "cut -c #{character_list}"
      end
    end
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
# backup_file! and restore_file! use ::FileUtils; require it here, as the
# sibling files do, so this file does not depend on load order.
require 'fileutils'

class RemoteTable
  # Base class for the concrete table formats. Subclasses implement #each;
  # Enumerable supplies the rest of the collection API on top of it.
  class Format
    # Exception class for formats that cannot be identified.
    class Unknown < StandardError; end

    autoload :Excel, 'remote_table/format/excel'
    autoload :Excelx, 'remote_table/format/excelx'
    autoload :Delimited, 'remote_table/format/delimited'
    autoload :OpenOffice, 'remote_table/format/open_office'
    autoload :FixedWidth, 'remote_table/format/fixed_width'
    autoload :HTML, 'remote_table/format/html'

    autoload :Textual, 'remote_table/format/mixins/textual'
    autoload :Rooable, 'remote_table/format/mixins/rooable'

    # The table this format reads; it supplies local_file and properties.
    attr_reader :t

    # @param t the table whose local file this format will parse
    def initialize(t)
      @t = t
    end

    include ::Enumerable

    # Yields each row; must be overridden by the concrete format.
    #
    # @raise [RuntimeError] always, in this base class
    def each
      raise "must be defined by format"
    end

    # Copies the local file to "<path>.backup" so that in-place rewrites can
    # be undone by #restore_file!.
    def backup_file!
      ::FileUtils.cp t.local_file.path, "#{t.local_file.path}.backup"
    end

    # Moves "<path>.backup" back over the local file. Does nothing when there
    # is no readable backup, so it is safe to call from an ensure clause.
    def restore_file!
      return unless ::File.readable? "#{t.local_file.path}.backup"
      ::FileUtils.mv "#{t.local_file.path}.backup", t.local_file.path
    end
  end
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
require 'singleton'
|
2
|
+
require 'digest/md5'
|
3
|
+
class RemoteTable
  # Singleton that fingerprints a row as the MD5 hex digest of its
  # marshalled, normalized contents.
  class Hasher
    include ::Singleton

    # @param row [Hash] a row as yielded by one of the formats
    # @return [String] 32-character hexadecimal MD5 digest
    def hash(row)
      # sabshere 1/21/11 may currently break across versions of ruby
      # ruby-1.8.7-p174 > Marshal.dump({'a' => '1'})
      # => "\004\b{\006\"\006a\"\0061"
      # ruby-1.9.2-p0 > Marshal.dump({'a' => '1'})
      # => "\x04\b{\x06I\"\x06a\x06:\x06ETI\"\x061\x06;\x00T"
      ::Digest::MD5.hexdigest ::Marshal.dump(normalize(row))
    end

    private

    # On Ruby 1.9+ builds a plain Hash whose keys are inserted in sorted
    # order, with keys and values stringified and converted to UTF-8. On
    # older Rubies the row is only copied into a plain Hash.
    def normalize(row)
      return ::Hash.new.replace(row) unless RUBY_VERSION >= '1.9'
      normalized = ::Hash.new
      row.keys.sort.each do |key|
        utf8_key = key.to_s.toutf8
        utf8_value = row[key].respond_to?(:to_s) ? row[key].to_s.toutf8 : row[key]
        normalized[utf8_key] = utf8_value
      end
      normalized
    end
  end
end
|