remote_table 1.3.0 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +2 -0
- data/CHANGELOG +19 -0
- data/Gemfile +7 -1
- data/README.markdown +440 -0
- data/Rakefile +6 -14
- data/lib/remote_table.rb +27 -38
- data/lib/remote_table/{properties.rb → config.rb} +39 -43
- data/lib/remote_table/format.rb +24 -27
- data/lib/remote_table/format/delimited.rb +17 -21
- data/lib/remote_table/format/fixed_width.rb +9 -9
- data/lib/remote_table/format/html.rb +0 -2
- data/lib/remote_table/format/mixins/processed_by_nokogiri.rb +13 -12
- data/lib/remote_table/format/mixins/processed_by_roo.rb +17 -13
- data/lib/remote_table/format/mixins/textual.rb +13 -13
- data/lib/remote_table/format/open_office.rb +3 -0
- data/lib/remote_table/format/xml.rb +0 -2
- data/lib/remote_table/format/yaml.rb +14 -0
- data/lib/remote_table/local_file.rb +69 -7
- data/lib/remote_table/transformer.rb +7 -4
- data/lib/remote_table/version.rb +1 -1
- data/remote_table.gemspec +5 -13
- data/test/fixtures/data.yml +4 -0
- data/test/helper.rb +8 -9
- data/test/test_big.rb +43 -53
- data/test/test_errata.rb +27 -25
- data/test/test_old_syntax.rb +193 -191
- data/test/test_old_transform.rb +12 -10
- data/test/test_remote_table.rb +57 -47
- metadata +48 -64
- data/.document +0 -5
- data/README.rdoc +0 -167
- data/lib/remote_table/utils.rb +0 -157
@@ -1,37 +1,33 @@
|
|
1
|
-
if RUBY_VERSION >= '1.9'
|
2
|
-
require 'csv'
|
3
|
-
::RemoteTable::MyCSV = ::CSV
|
4
|
-
else
|
5
|
-
begin
|
6
|
-
require 'fastercsv'
|
7
|
-
::RemoteTable::MyCSV = ::FasterCSV
|
8
|
-
rescue ::LoadError
|
9
|
-
$stderr.puts "[remote_table] You probably need to manually install the fastercsv gem and/or require it in your Gemfile."
|
10
|
-
raise $!
|
11
|
-
end
|
12
|
-
end
|
13
|
-
|
14
1
|
class RemoteTable
|
15
2
|
class Format
|
16
3
|
class Delimited < Format
|
4
|
+
if ::RUBY_VERSION >= '1.9'
|
5
|
+
require 'csv'
|
6
|
+
Engine = ::CSV
|
7
|
+
else
|
8
|
+
require 'fastercsv'
|
9
|
+
Engine = ::FasterCSV
|
10
|
+
end
|
11
|
+
|
17
12
|
include Textual
|
13
|
+
|
18
14
|
def each(&blk)
|
19
15
|
remove_useless_characters!
|
20
16
|
fix_newlines!
|
21
17
|
transliterate_whole_file_to_utf8!
|
22
18
|
skip_rows!
|
23
|
-
|
24
|
-
if row.is_a?(
|
19
|
+
Engine.new(t.local_file.encoded_io, fastercsv_options).each do |row|
|
20
|
+
if row.is_a?(Engine::Row)
|
25
21
|
hash = row.inject(::ActiveSupport::OrderedHash.new) do |memo, (k, v)|
|
26
22
|
if k.present?
|
27
23
|
memo[k] = v.to_s
|
28
24
|
end
|
29
25
|
memo
|
30
26
|
end
|
31
|
-
yield hash if t.
|
27
|
+
yield hash if t.config.keep_blank_rows or hash.any? { |k, v| v.present? }
|
32
28
|
elsif row.is_a?(::Array)
|
33
29
|
array = row.map { |v| v.to_s }
|
34
|
-
yield array if t.
|
30
|
+
yield array if t.config.keep_blank_rows or array.any? { |v| v.present? }
|
35
31
|
end
|
36
32
|
end
|
37
33
|
ensure
|
@@ -54,10 +50,10 @@ class RemoteTable
|
|
54
50
|
]
|
55
51
|
|
56
52
|
def fastercsv_options
|
57
|
-
hsh = t.
|
58
|
-
hsh
|
59
|
-
hsh.reverse_merge! :headers => t.
|
60
|
-
hsh.reverse_merge! :col_sep => t.
|
53
|
+
hsh = t.config.user_specified_options.slice *FASTERCSV_OPTIONS
|
54
|
+
hsh[:skip_blanks] = !t.config.keep_blank_rows
|
55
|
+
hsh.reverse_merge! :headers => t.config.headers
|
56
|
+
hsh.reverse_merge! :col_sep => t.config.delimiter
|
61
57
|
end
|
62
58
|
end
|
63
59
|
end
|
@@ -1,10 +1,10 @@
|
|
1
|
-
require 'fixed_width'
|
2
|
-
|
3
1
|
class RemoteTable
|
4
2
|
class Format
|
5
3
|
class FixedWidth < Format
|
6
4
|
include Textual
|
7
5
|
def each(&blk)
|
6
|
+
require 'fixed_width-multibyte'
|
7
|
+
|
8
8
|
remove_useless_characters!
|
9
9
|
fix_newlines!
|
10
10
|
transliterate_whole_file_to_utf8!
|
@@ -16,7 +16,7 @@ class RemoteTable
|
|
16
16
|
row.each do |k, v|
|
17
17
|
row[k] = v.strip
|
18
18
|
end
|
19
|
-
yield row if t.
|
19
|
+
yield row if t.config.keep_blank_rows or row.any? { |k, v| v.present? }
|
20
20
|
end
|
21
21
|
ensure
|
22
22
|
t.local_file.cleanup
|
@@ -27,21 +27,21 @@ class RemoteTable
|
|
27
27
|
def parser
|
28
28
|
return @parser if @parser.is_a?(::FixedWidth::Parser)
|
29
29
|
if ::FixedWidth::Section.private_instance_methods.map(&:to_sym).include?(:unpacker)
|
30
|
-
raise "
|
30
|
+
raise ::RuntimeError, "You need a different (newer) version of the FixedWidth gem that supports multibyte encoding, sometime after https://github.com/timonk/fixed_width/pull/1 was incorporated"
|
31
31
|
end
|
32
32
|
@parser = ::FixedWidth::Parser.new definition, t.local_file.encoded_io
|
33
33
|
end
|
34
34
|
|
35
35
|
def definition
|
36
|
-
@definition ||= if t.
|
37
|
-
::FixedWidth.send :definition, t.
|
38
|
-
elsif t.
|
36
|
+
@definition ||= if t.config.schema_name.is_a?(::String) or t.config.schema_name.is_a?(::Symbol)
|
37
|
+
::FixedWidth.send :definition, t.config.schema_name
|
38
|
+
elsif t.config.schema.is_a?(::Array)
|
39
39
|
everything = lambda { |_| true }
|
40
40
|
srand # in case this was forked by resque
|
41
41
|
::FixedWidth.define(rand.to_s) do |d|
|
42
42
|
d.rows do |row|
|
43
43
|
row.trap(&everything)
|
44
|
-
t.
|
44
|
+
t.config.schema.each do |name, width, options|
|
45
45
|
name = name.to_s
|
46
46
|
if name == 'spacer'
|
47
47
|
row.spacer width
|
@@ -52,7 +52,7 @@ class RemoteTable
|
|
52
52
|
end
|
53
53
|
end
|
54
54
|
else
|
55
|
-
raise "expecting schema_name to be a String or Symbol, or schema to be an Array"
|
55
|
+
raise ::ArgumentError, "expecting schema_name to be a String or Symbol, or schema to be an Array"
|
56
56
|
end
|
57
57
|
end
|
58
58
|
end
|
@@ -1,21 +1,22 @@
|
|
1
|
-
require 'nokogiri'
|
2
|
-
require 'cgi'
|
3
1
|
class RemoteTable
|
4
2
|
class Format
|
5
3
|
module ProcessedByNokogiri
|
6
4
|
def each
|
7
|
-
|
5
|
+
require 'nokogiri'
|
6
|
+
require 'cgi'
|
7
|
+
|
8
|
+
raise ::ArgumentError, "Need :row_css or :row_xpath in order to process XML or HTML" unless t.config.row_css or t.config.row_xpath
|
8
9
|
remove_useless_characters!
|
9
10
|
transliterate_whole_file_to_utf8!
|
10
11
|
|
11
|
-
headers = t.
|
12
|
+
headers = t.config.headers
|
12
13
|
|
13
14
|
xml = nokogiri_class.parse(unescaped_xml_without_soft_hyphens, nil, 'UTF-8')
|
14
|
-
(row_css? ? xml.css(t.
|
15
|
+
(row_css? ? xml.css(t.config.row_css) : xml.xpath(t.config.row_xpath)).each do |row|
|
15
16
|
values = if column_css?
|
16
|
-
row.css(t.
|
17
|
+
row.css(t.config.column_css)
|
17
18
|
elsif column_xpath?
|
18
|
-
row.xpath(t.
|
19
|
+
row.xpath(t.config.column_xpath)
|
19
20
|
else
|
20
21
|
[row]
|
21
22
|
end.map { |cell| assume_utf8 cell.content.gsub(/\s+/, ' ').strip }
|
@@ -23,12 +24,12 @@ class RemoteTable
|
|
23
24
|
headers = values.select(&:present?)
|
24
25
|
next
|
25
26
|
end
|
26
|
-
output = if t.
|
27
|
+
output = if t.config.output_class == ::Array
|
27
28
|
values
|
28
29
|
else
|
29
30
|
zip headers, values
|
30
31
|
end
|
31
|
-
if t.
|
32
|
+
if t.config.keep_blank_rows or values.any?
|
32
33
|
yield output
|
33
34
|
end
|
34
35
|
end
|
@@ -39,15 +40,15 @@ class RemoteTable
|
|
39
40
|
private
|
40
41
|
|
41
42
|
def row_css?
|
42
|
-
!!t.
|
43
|
+
!!t.config.row_css
|
43
44
|
end
|
44
45
|
|
45
46
|
def column_css?
|
46
|
-
!!t.
|
47
|
+
!!t.config.column_css
|
47
48
|
end
|
48
49
|
|
49
50
|
def column_xpath?
|
50
|
-
!!t.
|
51
|
+
!!t.config.column_xpath
|
51
52
|
end
|
52
53
|
|
53
54
|
# http://snippets.dzone.com/posts/show/406
|
@@ -1,33 +1,37 @@
|
|
1
|
-
require 'roo'
|
2
1
|
class RemoteTable
|
3
2
|
class Format
|
4
3
|
module ProcessedByRoo
|
5
4
|
def each(&blk)
|
5
|
+
require 'iconv'
|
6
|
+
require 'roo'
|
7
|
+
|
6
8
|
spreadsheet = roo_class.new t.local_file.path, nil, :ignore
|
7
|
-
|
9
|
+
if t.config.sheet
|
10
|
+
spreadsheet.default_sheet = t.config.sheet
|
11
|
+
end
|
8
12
|
|
9
|
-
first_row = if t.
|
10
|
-
t.
|
13
|
+
first_row = if t.config.crop
|
14
|
+
t.config.crop.first + 1
|
11
15
|
else
|
12
|
-
t.
|
16
|
+
t.config.skip + 1
|
13
17
|
end
|
14
18
|
|
15
|
-
last_row = if t.
|
16
|
-
t.
|
19
|
+
last_row = if t.config.crop
|
20
|
+
t.config.crop.last
|
17
21
|
else
|
18
22
|
spreadsheet.last_row
|
19
23
|
end
|
20
24
|
|
21
|
-
if t.
|
25
|
+
if t.config.output_class == ::Array
|
22
26
|
(first_row..last_row).each do |y|
|
23
27
|
output = (1..spreadsheet.last_column).map do |x|
|
24
28
|
assume_utf8 spreadsheet.cell(y, x).to_s.gsub(/<[^>]+>/, '').strip
|
25
29
|
end
|
26
|
-
yield output if t.
|
30
|
+
yield output if t.config.keep_blank_rows or output.any? { |v| v.present? }
|
27
31
|
end
|
28
32
|
else
|
29
|
-
headers =
|
30
|
-
if t.
|
33
|
+
headers = ::ActiveSupport::OrderedHash.new
|
34
|
+
if t.config.use_first_row_as_header?
|
31
35
|
(1..spreadsheet.last_column).each do |x|
|
32
36
|
v = spreadsheet.cell(first_row, x)
|
33
37
|
v = spreadsheet.cell(first_row - 1, x) if v.blank? # look up
|
@@ -39,7 +43,7 @@ class RemoteTable
|
|
39
43
|
# "advance the cursor"
|
40
44
|
first_row += 1
|
41
45
|
else
|
42
|
-
t.
|
46
|
+
t.config.headers.each_with_index do |k, i|
|
43
47
|
headers[k] = i + 1
|
44
48
|
end
|
45
49
|
end
|
@@ -48,7 +52,7 @@ class RemoteTable
|
|
48
52
|
headers.each do |k, x|
|
49
53
|
output[k] = assume_utf8 spreadsheet.cell(y, x).to_s.gsub(/<[^>]+>/, '').strip
|
50
54
|
end
|
51
|
-
yield output if t.
|
55
|
+
yield output if t.config.keep_blank_rows or output.any? { |k, v| v.present? }
|
52
56
|
end
|
53
57
|
end
|
54
58
|
ensure
|
@@ -7,36 +7,36 @@ class RemoteTable
|
|
7
7
|
'\xc2\xad', # soft hyphen, often inserted by MS Office (html: &shy;)
|
8
8
|
]
|
9
9
|
def remove_useless_characters!
|
10
|
-
|
11
|
-
if t.
|
10
|
+
t.local_file.in_place :perl, "s/#{USELESS_CHARACTERS.join('//g; s/')}//g"
|
11
|
+
if t.config.internal_encoding =~ /windows.?1252/i
|
12
12
|
# soft hyphen again, as I have seen it appear in windows 1252
|
13
|
-
|
13
|
+
t.local_file.in_place :perl, 's/\xad//g'
|
14
14
|
end
|
15
15
|
end
|
16
16
|
|
17
17
|
def transliterate_whole_file_to_utf8!
|
18
|
-
|
19
|
-
t.
|
18
|
+
t.local_file.in_place :iconv, t.config.external_encoding_iconv, t.config.internal_encoding
|
19
|
+
t.config.user_specified_options[:encoding] = t.config.external_encoding
|
20
20
|
end
|
21
21
|
|
22
22
|
def fix_newlines!
|
23
|
-
|
23
|
+
t.local_file.in_place :perl, 's/\r\n|\n|\r/\n/g'
|
24
24
|
end
|
25
25
|
|
26
26
|
def skip_rows!
|
27
|
-
return unless t.
|
28
|
-
|
27
|
+
return unless t.config.skip > 0
|
28
|
+
t.local_file.in_place :tail, "+#{t.config.skip + 1}"
|
29
29
|
end
|
30
30
|
|
31
31
|
def crop_rows!
|
32
|
-
return unless t.
|
33
|
-
|
34
|
-
|
32
|
+
return unless t.config.crop
|
33
|
+
t.local_file.in_place :tail, "+#{t.config.crop.first}"
|
34
|
+
t.local_file.in_place :head, (t.config.crop.last - t.config.crop.first + 1)
|
35
35
|
end
|
36
36
|
|
37
37
|
def cut_columns!
|
38
|
-
return unless t.
|
39
|
-
|
38
|
+
return unless t.config.cut
|
39
|
+
t.local_file.in_place :cut, t.config.cut
|
40
40
|
end
|
41
41
|
end
|
42
42
|
end
|
@@ -3,6 +3,9 @@ class RemoteTable
|
|
3
3
|
class OpenOffice < Format
|
4
4
|
include ProcessedByRoo
|
5
5
|
def roo_class
|
6
|
+
if ::RUBY_PLATFORM == 'java'
|
7
|
+
::Kernel.warn "[remote_table] Opening ODS files on JRuby is known to fail because of a flaw in the underlying Roo library"
|
8
|
+
end
|
6
9
|
::Openoffice
|
7
10
|
end
|
8
11
|
end
|
@@ -1,13 +1,71 @@
|
|
1
1
|
require 'fileutils'
|
2
|
+
require 'unix_utils'
|
2
3
|
|
3
4
|
class RemoteTable
|
4
5
|
class LocalFile #:nodoc:all
|
6
|
+
class << self
|
7
|
+
def decompress(input, compression)
|
8
|
+
output = case compression
|
9
|
+
when :zip, :exe
|
10
|
+
::UnixUtils.unzip input
|
11
|
+
when :bz2
|
12
|
+
::UnixUtils.bunzip2 input
|
13
|
+
when :gz
|
14
|
+
::UnixUtils.gunzip input
|
15
|
+
else
|
16
|
+
raise ::ArgumentError, "Unrecognized compression #{compression}"
|
17
|
+
end
|
18
|
+
::FileUtils.rm_f input
|
19
|
+
output
|
20
|
+
end
|
21
|
+
|
22
|
+
def unpack(input, packing)
|
23
|
+
output = case packing
|
24
|
+
when :tar
|
25
|
+
::UnixUtils.untar input
|
26
|
+
else
|
27
|
+
raise ::ArgumentError, "Unrecognized packing #{packing}"
|
28
|
+
end
|
29
|
+
::FileUtils.rm_f input
|
30
|
+
output
|
31
|
+
end
|
32
|
+
|
33
|
+
def pick(input, options = {})
|
34
|
+
options = options.symbolize_keys
|
35
|
+
if (options[:filename] or options[:glob]) and not ::File.directory?(input)
|
36
|
+
raise ::RuntimeError, "Expecting #{input} to be a directory"
|
37
|
+
end
|
38
|
+
if filename = options[:filename]
|
39
|
+
src = ::File.join input, filename
|
40
|
+
raise(::RuntimeError, "Expecting #{src} to be a file") unless ::File.file?(src)
|
41
|
+
output = ::UnixUtils.tmp_path src
|
42
|
+
::FileUtils.mv src, output
|
43
|
+
::FileUtils.rm_rf input if ::File.dirname(input).start_with?(::Dir.tmpdir)
|
44
|
+
elsif glob = options[:glob]
|
45
|
+
src = ::Dir[input+glob].first
|
46
|
+
raise(::RuntimeError, "Expecting #{glob} to find a file in #{input}") unless src and ::File.file?(src)
|
47
|
+
output = ::UnixUtils.tmp_path src
|
48
|
+
::FileUtils.mv src, output
|
49
|
+
::FileUtils.rm_rf input if ::File.dirname(input).start_with?(::Dir.tmpdir)
|
50
|
+
else
|
51
|
+
output = ::UnixUtils.tmp_path input
|
52
|
+
::FileUtils.mv input, output
|
53
|
+
end
|
54
|
+
output
|
55
|
+
end
|
56
|
+
end
|
5
57
|
|
6
58
|
attr_reader :t
|
7
59
|
|
8
60
|
def initialize(t)
|
9
61
|
@t = t
|
10
62
|
end
|
63
|
+
|
64
|
+
def in_place(*args)
|
65
|
+
bin = args.shift
|
66
|
+
tmp_path = ::UnixUtils.send(*([bin,path]+args))
|
67
|
+
::FileUtils.mv tmp_path, path
|
68
|
+
end
|
11
69
|
|
12
70
|
def path
|
13
71
|
generate unless generated?
|
@@ -16,7 +74,7 @@ class RemoteTable
|
|
16
74
|
|
17
75
|
def encoded_io
|
18
76
|
@encoded_io ||= if ::RUBY_VERSION >= '1.9'
|
19
|
-
::File.open path, 'rb', :internal_encoding => t.
|
77
|
+
::File.open path, 'rb', :internal_encoding => t.config.internal_encoding, :external_encoding => t.config.external_encoding
|
20
78
|
else
|
21
79
|
::File.open path, 'rb'
|
22
80
|
end
|
@@ -41,14 +99,18 @@ class RemoteTable
|
|
41
99
|
end
|
42
100
|
|
43
101
|
def generate
|
44
|
-
|
45
|
-
if
|
46
|
-
|
102
|
+
# sabshere 7/20/11 make web requests move more slowly so you don't get accused of DOS
|
103
|
+
if ::ENV.has_key?('REMOTE_TABLE_DELAY_BETWEEN_REQUESTS')
|
104
|
+
::Kernel.sleep ::ENV['REMOTE_TABLE_DELAY_BETWEEN_REQUESTS'].to_i
|
105
|
+
end
|
106
|
+
tmp_path = ::UnixUtils.curl t.config.uri.to_s, t.config.form_data
|
107
|
+
if compression = t.config.compression
|
108
|
+
tmp_path = LocalFile.decompress tmp_path, compression
|
47
109
|
end
|
48
|
-
if packing = t.
|
49
|
-
tmp_path =
|
110
|
+
if packing = t.config.packing
|
111
|
+
tmp_path = LocalFile.unpack tmp_path, packing
|
50
112
|
end
|
51
|
-
@path =
|
113
|
+
@path = LocalFile.pick tmp_path, :filename => t.config.filename, :glob => t.config.glob
|
52
114
|
@generated = true
|
53
115
|
end
|
54
116
|
end
|