remote_table 1.4.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +17 -0
- data/README.markdown +59 -37
- data/lib/remote_table.rb +478 -56
- data/lib/remote_table/delimited.rb +91 -0
- data/lib/remote_table/fixed_width.rb +81 -0
- data/lib/remote_table/html.rb +13 -0
- data/lib/remote_table/{local_file.rb → local_copy.rb} +26 -22
- data/lib/remote_table/ods.rb +17 -0
- data/lib/remote_table/plaintext.rb +67 -0
- data/lib/remote_table/processed_by_nokogiri.rb +76 -0
- data/lib/remote_table/processed_by_roo.rb +97 -0
- data/lib/remote_table/transformer.rb +9 -5
- data/lib/remote_table/version.rb +1 -1
- data/lib/remote_table/xls.rb +11 -0
- data/lib/remote_table/xlsx.rb +11 -0
- data/lib/remote_table/xml.rb +13 -0
- data/lib/remote_table/yaml.rb +14 -0
- data/remote_table.gemspec +2 -2
- data/test/test_big.rb +1 -1
- data/test/test_remote_table.rb +26 -21
- metadata +19 -20
- data/lib/remote_table/config.rb +0 -251
- data/lib/remote_table/format.rb +0 -49
- data/lib/remote_table/format/delimited.rb +0 -60
- data/lib/remote_table/format/excel.rb +0 -10
- data/lib/remote_table/format/excelx.rb +0 -10
- data/lib/remote_table/format/fixed_width.rb +0 -60
- data/lib/remote_table/format/html.rb +0 -12
- data/lib/remote_table/format/mixins/processed_by_nokogiri.rb +0 -70
- data/lib/remote_table/format/mixins/processed_by_roo.rb +0 -63
- data/lib/remote_table/format/mixins/textual.rb +0 -43
- data/lib/remote_table/format/open_office.rb +0 -13
- data/lib/remote_table/format/xml.rb +0 -12
- data/lib/remote_table/format/yaml.rb +0 -14
data/lib/remote_table/format.rb
DELETED
@@ -1,49 +0,0 @@
|
|
1
|
-
require 'iconv'
|
2
|
-
if RUBY_VERSION >= '1.9'
|
3
|
-
# for an excellent explanation see http://blog.segment7.net/2010/12/17/from-iconv-iconv-to-string-encode
|
4
|
-
Kernel.warn "[remote_table] Apologies - using iconv because Ruby 1.9.x's String#encode doesn't have transliteration tables (yet)"
|
5
|
-
end
|
6
|
-
|
7
|
-
require 'remote_table/format/mixins/textual'
|
8
|
-
require 'remote_table/format/mixins/processed_by_roo'
|
9
|
-
require 'remote_table/format/mixins/processed_by_nokogiri'
|
10
|
-
require 'remote_table/format/excel'
|
11
|
-
require 'remote_table/format/excelx'
|
12
|
-
require 'remote_table/format/delimited'
|
13
|
-
require 'remote_table/format/open_office'
|
14
|
-
require 'remote_table/format/fixed_width'
|
15
|
-
require 'remote_table/format/html'
|
16
|
-
require 'remote_table/format/xml'
|
17
|
-
require 'remote_table/format/yaml'
|
18
|
-
class RemoteTable
  # Common base for the concrete table formats. It keeps a reference to the
  # owning table object and offers the encoding helpers shared by subclasses.
  # Subclasses are expected to define #each; Enumerable is mixed in on top of it.
  class Format
    include ::Enumerable

    # The object passed to the constructor. This class only reads +t.config+
    # from it.
    attr_reader :t

    # @param t the owning table object (must respond to +config+)
    def initialize(t)
      @t = t
    end

    # Runs +str+ through the memoized Iconv converter.
    #
    # @param str [String, Object]
    # @return [String, nil] the converted text, or nil when +str+ is not a String
    def transliterate_to_utf8(str)
      return unless str.is_a?(::String)
      # The second call passes nil; presumably this flushes any state the
      # converter is still holding (see the Iconv docs) -- confirm.
      [ iconv.iconv(str), iconv.iconv(nil) ].join
    end

    # Re-encodes +str+ to +t.config.external_encoding+ on Ruby 1.9+.
    #
    # Unfrozen strings are converted in place (as before) and returned.
    # Frozen strings cannot be mutated, so a converted copy is returned
    # instead of raising.
    #
    # @param str [String, Object]
    # @return [String, Object] the encoded string; non-strings (and every
    #   value on Ruby < 1.9) are returned untouched
    def assume_utf8(str)
      return str unless str.is_a?(::String) && ::RUBY_VERSION >= '1.9'
      encoding = t.config.external_encoding
      str.frozen? ? str.encode(encoding) : str.encode!(encoding)
    end

    private

    # Lazily built converter from +t.config.internal_encoding+ to
    # +t.config.external_encoding_iconv+.
    def iconv
      @iconv ||= ::Iconv.new(t.config.external_encoding_iconv, t.config.internal_encoding)
    end
  end
end
|
@@ -1,60 +0,0 @@
|
|
1
|
-
class RemoteTable
  class Format
    # Delimited text (CSV and friends). The local file is cleaned up with the
    # Textual helpers and then handed to the CSV engine row by row.
    class Delimited < Format
      # Ruby 1.9+ ships the FasterCSV code base as the stdlib CSV; older
      # Rubies need the fastercsv gem.
      if ::RUBY_VERSION >= '1.9'
        require 'csv'
        Engine = ::CSV
      else
        require 'fastercsv'
        Engine = ::FasterCSV
      end

      include Textual

      # Yields one record per parsed row.
      #
      # Rows the engine returns as Engine::Row become ordered hashes (columns
      # whose header is blank are dropped); plain Array rows become arrays of
      # strings. Rows without any non-blank value are skipped unless
      # +t.config.keep_blank_rows+ is set. The local file is always cleaned up
      # afterwards, even when parsing raises.
      def each(&blk)
        remove_useless_characters!
        fix_newlines!
        transliterate_whole_file_to_utf8!
        skip_rows!
        reader = Engine.new(t.local_file.encoded_io, fastercsv_options)
        reader.each do |row|
          case row
          when Engine::Row
            record = ::ActiveSupport::OrderedHash.new
            row.each do |header, value|
              record[header] = value.to_s if header.present?
            end
            yield record if t.config.keep_blank_rows || record.any? { |_, value| value.present? }
          when ::Array
            cells = row.map(&:to_s)
            yield cells if t.config.keep_blank_rows || cells.any?(&:present?)
          end
        end
      ensure
        t.local_file.cleanup
      end

      private

      # User-specified options that are passed straight through to the engine.
      FASTERCSV_OPTIONS = [
        :unconverted_fields, :col_sep, :headers, :row_sep, :return_headers,
        :header_converters, :quote_char, :skip_blanks, :converters, :force_quotes,
      ]

      # Builds the option hash for the engine: the whitelisted user options,
      # :skip_blanks forced from keep_blank_rows, and :headers / :col_sep
      # defaulted from the config when the user did not supply them.
      def fastercsv_options
        options = t.config.user_specified_options.slice(*FASTERCSV_OPTIONS)
        options[:skip_blanks] = !t.config.keep_blank_rows
        options.reverse_merge!(:headers => t.config.headers)
        options.reverse_merge!(:col_sep => t.config.delimiter)
      end
    end
  end
end
|
@@ -1,60 +0,0 @@
|
|
1
|
-
class RemoteTable
  class Format
    # Fixed-width text files, parsed with the FixedWidth gem after the local
    # file has been normalised by the Textual helpers.
    class FixedWidth < Format
      include Textual

      # Yields each parsed row as a hash of column name => stripped value.
      #
      # Columns with a blank name are removed. Rows without any non-blank
      # value are skipped unless +t.config.keep_blank_rows+ is set. The local
      # file is cleaned up afterwards in every case.
      def each(&blk)
        require 'fixed_width-multibyte'

        remove_useless_characters!
        fix_newlines!
        transliterate_whole_file_to_utf8!
        crop_rows!
        skip_rows!
        cut_columns!
        parser.parse[:rows].each do |row|
          row.reject! { |name, _| name.blank? }
          row.each { |name, value| row[name] = value.strip }
          yield row if t.config.keep_blank_rows || row.any? { |_, value| value.present? }
        end
      ensure
        t.local_file.cleanup
      end

      private

      # Memoized FixedWidth parser over the encoded local file.
      #
      # Raises RuntimeError when FixedWidth::Section still has a private
      # +unpacker+ method, which this code treats as the sign of a gem version
      # without multibyte support.
      def parser
        return @parser if @parser.is_a?(::FixedWidth::Parser)
        outdated = ::FixedWidth::Section.private_instance_methods.map(&:to_sym).include?(:unpacker)
        if outdated
          raise ::RuntimeError, "You need a different (newer) version of the FixedWidth gem that supports multibyte encoding, sometime after https://github.com/timonk/fixed_width/pull/1 was incorporated"
        end
        @parser = ::FixedWidth::Parser.new definition, t.local_file.encoded_io
      end

      # Memoized FixedWidth definition: either looked up by
      # +t.config.schema_name+ (String or Symbol) or built from the
      # +t.config.schema+ array of [name, width, options] entries, where an
      # entry named "spacer" becomes a spacer instead of a column.
      #
      # Raises ArgumentError when neither form is configured.
      def definition
        return @definition if @definition
        schema_name = t.config.schema_name
        @definition = case schema_name
        when ::String, ::Symbol
          ::FixedWidth.send :definition, schema_name
        else
          unless t.config.schema.is_a?(::Array)
            raise ::ArgumentError, "expecting schema_name to be a String or Symbol, or schema to be an Array"
          end
          accept_every_line = lambda { |_| true }
          srand # in case this was forked by resque
          ::FixedWidth.define(rand.to_s) do |d|
            d.rows do |row|
              row.trap(&accept_every_line)
              t.config.schema.each do |name, width, options|
                column_name = name.to_s
                if column_name == 'spacer'
                  row.spacer width
                else
                  row.column column_name, width, options
                end
              end
            end
          end
        end
      end
    end
  end
end
|
@@ -1,70 +0,0 @@
|
|
1
|
-
class RemoteTable
  class Format
    # Row extraction for markup formats (XML / HTML) via Nokogiri.
    #
    # The including class must provide +t+, +nokogiri_class+, +assume_utf8+,
    # +remove_useless_characters!+ and +transliterate_whole_file_to_utf8!+.
    module ProcessedByNokogiri
      # Yields one record per node matched by +t.config.row_css+ (preferred)
      # or +t.config.row_xpath+.
      #
      # Cell values come from +column_css+ / +column_xpath+ inside each row
      # node, or from the row node itself when neither is configured;
      # whitespace runs are collapsed to a single space and the value is
      # stripped. When +t.config.headers+ is +:first_row+, the first matched
      # row supplies the headers and is not yielded. Records are arrays when
      # +t.config.output_class+ is Array, otherwise ordered hashes.
      #
      # Raises ArgumentError when neither row selector is configured. The
      # local file is cleaned up afterwards in every case.
      def each
        require 'nokogiri'
        require 'cgi'

        unless t.config.row_css || t.config.row_xpath
          raise ::ArgumentError, "Need :row_css or :row_xpath in order to process XML or HTML"
        end
        remove_useless_characters!
        transliterate_whole_file_to_utf8!

        headers = t.config.headers

        xml = nokogiri_class.parse(unescaped_xml_without_soft_hyphens, nil, 'UTF-8')
        rows = row_css? ? xml.css(t.config.row_css) : xml.xpath(t.config.row_xpath)
        rows.each do |row|
          cells = if column_css?
            row.css(t.config.column_css)
          elsif column_xpath?
            row.xpath(t.config.column_xpath)
          else
            [row]
          end
          values = cells.map { |cell| assume_utf8 cell.content.gsub(/\s+/, ' ').strip }
          if headers == :first_row
            # NOTE(review): blank header cells are dropped here, so later
            # headers shift left relative to the values they are zipped with --
            # confirm this is intended.
            headers = values.select(&:present?)
            next
          end
          output = if t.config.output_class == ::Array
            values
          else
            zip headers, values
          end
          # NOTE(review): values are Strings, so #any? is true as soon as the
          # row has at least one cell, even if every cell is empty. The other
          # formats test for a non-blank value instead -- confirm which is wanted.
          if t.config.keep_blank_rows || values.any?
            yield output
          end
        end
      ensure
        t.local_file.cleanup
      end

      private

      def row_css?
        !!t.config.row_css
      end

      def column_css?
        !!t.config.column_css
      end

      def column_xpath?
        !!t.config.column_xpath
      end

      # Pairs +keys+ with +values+ position by position into an ordered hash.
      # http://snippets.dzone.com/posts/show/406
      def zip(keys, values)
        hash = ::ActiveSupport::OrderedHash.new
        keys.zip(values) { |k,v| hash[k]=v }
        hash
      end

      # Reads the whole encoded local file, HTML-unescapes it and removes
      # soft hyphens (MS Office likes to insert them).
      #
      # CGI.unescapeHTML turns numeric references such as &#173; into the raw
      # U+00AD character but leaves the named "&shy;" entity untouched, so
      # both spellings have to be stripped.
      #
      # should we be doing this in ruby?
      #
      # @return [String] the cleaned markup
      def unescaped_xml_without_soft_hyphens
        str = ::CGI.unescapeHTML t.local_file.encoded_io.read
        # get rid of MS Office baddies
        str.gsub! '&shy;', ''
        # U+00AD built via pack so the invisible character is not hidden in
        # the source file
        str.gsub! [0xAD].pack('U'), ''
        str
      end
    end
  end
end
|
@@ -1,63 +0,0 @@
|
|
1
|
-
class RemoteTable
  class Format
    # Row extraction for spreadsheet formats opened through Roo.
    #
    # The including class must provide +t+, +roo_class+ and +assume_utf8+.
    module ProcessedByRoo
      # Yields one record per spreadsheet row between the first and last row
      # derived from +t.config.crop+ (or +t.config.skip+ and the sheet's last
      # row when no crop is configured).
      #
      # Cell text has anything that looks like a tag (<...>) removed and is
      # stripped. Records are arrays when +t.config.output_class+ is Array,
      # otherwise ordered hashes keyed by header. Rows without any non-blank
      # value are skipped unless +t.config.keep_blank_rows+ is set. The local
      # file is cleaned up afterwards in every case.
      def each(&blk)
        require 'iconv'
        require 'roo'

        spreadsheet = roo_class.new t.local_file.path, nil, :ignore
        spreadsheet.default_sheet = t.config.sheet if t.config.sheet

        # NOTE(review): with a crop the first row is crop.first + 1, whereas
        # Textual#crop_rows! starts at crop.first -- confirm this offset.
        if t.config.crop
          first_row = t.config.crop.first + 1
          last_row = t.config.crop.last
        else
          first_row = t.config.skip + 1
          last_row = spreadsheet.last_row
        end

        # Text of the cell at row y, column x, with tags removed.
        cell_text = lambda do |y, x|
          assume_utf8 spreadsheet.cell(y, x).to_s.gsub(/<[^>]+>/, '').strip
        end

        if t.config.output_class == ::Array
          (first_row..last_row).each do |y|
            cells = (1..spreadsheet.last_column).map { |x| cell_text.call(y, x) }
            yield cells if t.config.keep_blank_rows || cells.any?(&:present?)
          end
        else
          # header => 1-based column index
          columns = ::ActiveSupport::OrderedHash.new
          if t.config.use_first_row_as_header?
            (1..spreadsheet.last_column).each do |x|
              header = spreadsheet.cell(first_row, x)
              header = spreadsheet.cell(first_row - 1, x) if header.blank? # look up
              next unless header.present?
              columns[assume_utf8(header)] = x # 'foobar' is found at column 6
            end
            # "advance the cursor" past the header row
            first_row += 1
          else
            t.config.headers.each_with_index { |header, index| columns[header] = index + 1 }
          end
          (first_row..last_row).each do |y|
            record = ::ActiveSupport::OrderedHash.new
            columns.each { |header, x| record[header] = cell_text.call(y, x) }
            yield record if t.config.keep_blank_rows || record.any? { |_, value| value.present? }
          end
        end
      ensure
        t.local_file.cleanup
      end
    end
  end
end
|
@@ -1,43 +0,0 @@
|
|
1
|
-
require 'fileutils'
|
2
|
-
class RemoteTable
  class Format
    # Helpers for text-based formats. Each one rewrites the local file through
    # +t.local_file.in_place+, passing a command name and its arguments.
    # The including class must provide +t+.
    module Textual
      # Escape sequences (kept literal, for perl to interpret) that are
      # deleted from every file.
      USELESS_CHARACTERS = [
        '\xef\xbb\xbf', # UTF-8 byte order mark
        '\xc2\xad',     # soft hyphen, often inserted by MS Office (html: &shy;)
      ]

      # Deletes every USELESS_CHARACTERS sequence and, when the internal
      # encoding looks like Windows-1252, the single-byte soft hyphen too.
      def remove_useless_characters!
        substitutions = USELESS_CHARACTERS.map { |sequence| "s/#{sequence}//g" }
        t.local_file.in_place :perl, substitutions.join('; ')
        return unless t.config.internal_encoding =~ /windows.?1252/i
        # soft hyphen again, as I have seen it appear in windows 1252
        t.local_file.in_place :perl, 's/\xad//g'
      end

      # Converts the file from the internal encoding with iconv and records
      # the external encoding under :encoding in the user-specified options.
      def transliterate_whole_file_to_utf8!
        config = t.config
        t.local_file.in_place :iconv, config.external_encoding_iconv, config.internal_encoding
        config.user_specified_options[:encoding] = config.external_encoding
      end

      # Rewrites every CRLF, LF or CR line ending as LF.
      def fix_newlines!
        t.local_file.in_place :perl, 's/\r\n|\n|\r/\n/g'
      end

      # Drops the first +t.config.skip+ lines; does nothing when skip is zero.
      def skip_rows!
        rows_to_skip = t.config.skip
        t.local_file.in_place :tail, "+#{rows_to_skip + 1}" if rows_to_skip > 0
      end

      # Keeps only lines crop.first..crop.last; does nothing without a crop.
      def crop_rows!
        crop = t.config.crop
        return unless crop
        first_line = crop.first
        last_line = crop.last
        t.local_file.in_place :tail, "+#{first_line}"
        t.local_file.in_place :head, (last_line - first_line + 1)
      end

      # Passes +t.config.cut+ to cut; does nothing when it is not set.
      def cut_columns!
        columns = t.config.cut
        t.local_file.in_place :cut, columns if columns
      end
    end
  end
end
|
@@ -1,13 +0,0 @@
|
|
1
|
-
class RemoteTable
  class Format
    # OpenOffice (ODS) spreadsheets, read through Roo.
    class OpenOffice < Format
      include ProcessedByRoo

      # Returns the Roo class used to open the file. On JRuby a warning is
      # printed first, because the file's authors know ODS reading to fail there.
      def roo_class
        ::Kernel.warn "[remote_table] Opening ODS files on JRuby is known to fail because of a flaw in the underlying Roo library" if ::RUBY_PLATFORM == 'java'
        ::Openoffice
      end
    end
  end
end
|