remote_table 1.4.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,49 +0,0 @@
1
- require 'iconv'
2
- if RUBY_VERSION >= '1.9'
3
- # for an excellent explanation see http://blog.segment7.net/2010/12/17/from-iconv-iconv-to-string-encode
4
- Kernel.warn "[remote_table] Apologies - using iconv because Ruby 1.9.x's String#encode doesn't have transliteration tables (yet)"
5
- end
6
-
7
- require 'remote_table/format/mixins/textual'
8
- require 'remote_table/format/mixins/processed_by_roo'
9
- require 'remote_table/format/mixins/processed_by_nokogiri'
10
- require 'remote_table/format/excel'
11
- require 'remote_table/format/excelx'
12
- require 'remote_table/format/delimited'
13
- require 'remote_table/format/open_office'
14
- require 'remote_table/format/fixed_width'
15
- require 'remote_table/format/html'
16
- require 'remote_table/format/xml'
17
- require 'remote_table/format/yaml'
18
class RemoteTable
  # Common base for the format readers. It stores the owning table object
  # and offers two encoding helpers. Enumerable is mixed in here; this class
  # does not define #each itself, so subclasses are expected to supply it.
  class Format
    include ::Enumerable

    # The object passed to #initialize; the helpers below read +t.config+.
    attr_reader :t

    # @param t the owning table object (must respond to +config+)
    def initialize(t)
      @t = t
    end

    # Runs +str+ through the memoized Iconv converter, then flushes the
    # converter with +nil+ and appends whatever the flush returns.
    #
    # @param str [Object] value to convert
    # @return [String, nil] converted text, or nil when +str+ is not a String
    def transliterate_to_utf8(str)
      return unless str.is_a?(::String)

      converted = iconv.iconv(str)
      remainder = iconv.iconv(nil)
      [converted, remainder].join
    end

    # Re-encodes +str+ IN PLACE to +t.config.external_encoding+ when running
    # on Ruby 1.9 or newer. Anything that is not a String (and every value on
    # older Rubies) is handed back untouched.
    #
    # @param str [Object]
    # @return [Object] +str+ itself
    def assume_utf8(str)
      return str unless str.is_a?(::String)
      return str unless ::RUBY_VERSION >= '1.9'

      str.encode!(t.config.external_encoding)
    end

    private

    # Lazily built converter; Iconv.new takes (to, from), so this converts
    # from the configured internal encoding to the external one.
    def iconv
      @iconv ||= ::Iconv.new(t.config.external_encoding_iconv, t.config.internal_encoding)
    end
  end
end
@@ -1,60 +0,0 @@
1
class RemoteTable
  class Format
    # Reader for delimiter-separated text. Rows that the CSV engine returns
    # as header-aware Row objects are yielded as ordered hashes; rows that
    # come back as plain arrays are yielded as arrays of strings.
    class Delimited < Format
      # Ruby 1.9+ bundles CSV; older Rubies fall back to the fastercsv gem.
      if ::RUBY_VERSION >= '1.9'
        require 'csv'
        Engine = ::CSV
      else
        require 'fastercsv'
        Engine = ::FasterCSV
      end

      include Textual

      # Keys copied from the user-specified options into the engine options.
      FASTERCSV_OPTIONS = [
        :unconverted_fields,
        :col_sep,
        :headers,
        :row_sep,
        :return_headers,
        :header_converters,
        :quote_char,
        :skip_blanks,
        :converters,
        :force_quotes,
      ]

      # Cleans up the local file, parses it and yields each row. The local
      # file is cleaned up afterwards whether or not parsing succeeded.
      #
      # @yield [Hash, Array] one parsed row; every value is a String
      def each(&blk)
        remove_useless_characters!
        fix_newlines!
        transliterate_whole_file_to_utf8!
        skip_rows!
        reader = Engine.new(t.local_file.encoded_io, fastercsv_options)
        reader.each do |row|
          case row
          when Engine::Row
            record = ::ActiveSupport::OrderedHash.new
            row.each do |k, v|
              # columns without a header name are dropped
              record[k] = v.to_s if k.present?
            end
            yield record if t.config.keep_blank_rows || record.any? { |_k, v| v.present? }
          when ::Array
            cells = row.map { |v| v.to_s }
            yield cells if t.config.keep_blank_rows || cells.any? { |v| v.present? }
          end
        end
      ensure
        t.local_file.cleanup
      end

      private

      # Builds the option hash handed to the CSV engine: the whitelisted
      # user options, :skip_blanks forced from keep_blank_rows, and the
      # configured headers / delimiter as fallbacks.
      def fastercsv_options
        options = t.config.user_specified_options.slice(*FASTERCSV_OPTIONS)
        options[:skip_blanks] = !t.config.keep_blank_rows
        options.reverse_merge!(:headers => t.config.headers)
        options.reverse_merge!(:col_sep => t.config.delimiter)
      end
    end
  end
end
@@ -1,10 +0,0 @@
1
class RemoteTable
  class Format
    # Excel format: row iteration comes from ProcessedByRoo, this class only
    # names the reader class to use.
    class Excel < Format
      include ProcessedByRoo

      # @return [Class] the top-level ::Excel constant
      #   (presumably the Roo reader class; confirm against the roo gem)
      def roo_class
        ::Excel
      end
    end
  end
end
@@ -1,10 +0,0 @@
1
class RemoteTable
  class Format
    # Excelx format: row iteration comes from ProcessedByRoo, this class only
    # names the reader class to use.
    class Excelx < Format
      include ProcessedByRoo

      # @return [Class] the top-level ::Excelx constant
      #   (presumably the Roo reader class; confirm against the roo gem)
      def roo_class
        ::Excelx
      end
    end
  end
end
@@ -1,60 +0,0 @@
1
class RemoteTable
  class Format
    # Reader for fixed-width text files, parsed with the ::FixedWidth library
    # according to either a named schema or an inline schema array.
    class FixedWidth < Format
      include Textual

      # Prepares the local file (cleanup, newline fixing, transliteration,
      # cropping, skipping, column cutting), parses it and yields each row
      # with nameless columns removed and values stripped. The local file is
      # cleaned up afterwards in every case.
      #
      # @yield [Hash] one parsed row
      def each(&blk)
        require 'fixed_width-multibyte'

        remove_useless_characters!
        fix_newlines!
        transliterate_whole_file_to_utf8!
        crop_rows!
        skip_rows!
        cut_columns!
        parser.parse[:rows].each do |row|
          row.reject! { |k, _v| k.blank? }
          row.each { |k, v| row[k] = v.strip }
          next unless t.config.keep_blank_rows || row.any? { |_k, v| v.present? }

          yield row
        end
      ensure
        t.local_file.cleanup
      end

      private

      # Memoized parser over the local file's encoded IO.
      #
      # @raise [RuntimeError] when the loaded FixedWidth gem still has the
      #   private Section#unpacker method, which this code treats as the mark
      #   of a version without multibyte support
      def parser
        return @parser if @parser.is_a?(::FixedWidth::Parser)

        outdated = ::FixedWidth::Section.private_instance_methods.any? { |m| m.to_sym == :unpacker }
        if outdated
          raise ::RuntimeError, "You need a different (newer) version of the FixedWidth gem that supports multibyte encoding, sometime after https://github.com/timonk/fixed_width/pull/1 was incorporated"
        end
        @parser = ::FixedWidth::Parser.new(definition, t.local_file.encoded_io)
      end

      # Memoized schema definition: looked up by name when schema_name is a
      # String or Symbol, otherwise built from the schema array of
      # [name, width, options] entries.
      #
      # @raise [ArgumentError] when neither form of schema is configured
      def definition
        return @definition if @definition

        named = t.config.schema_name
        @definition =
          if named.is_a?(::String) || named.is_a?(::Symbol)
            ::FixedWidth.send(:definition, named)
          elsif t.config.schema.is_a?(::Array)
            everything = lambda { |_| true }
            srand # in case this was forked by resque
            # a random number serves as the name of the ad-hoc definition
            ::FixedWidth.define(rand.to_s) do |d|
              d.rows do |row|
                row.trap(&everything)
                t.config.schema.each do |name, width, options|
                  label = name.to_s
                  if label == 'spacer'
                    row.spacer width
                  else
                    row.column label, width, options
                  end
                end
              end
            end
          else
            raise ::ArgumentError, "expecting schema_name to be a String or Symbol, or schema to be an Array"
          end
      end
    end
  end
end
@@ -1,12 +0,0 @@
1
class RemoteTable
  class Format
    # HTML format: combines the text clean-up helpers from Textual with the
    # row extraction from ProcessedByNokogiri.
    class HTML < Format
      include Textual
      include ProcessedByNokogiri

      # @return [Class] the Nokogiri document class used to parse the file
      def nokogiri_class
        ::Nokogiri::HTML::Document
      end
    end
  end
end
@@ -1,70 +0,0 @@
1
class RemoteTable
  class Format
    # Row extraction shared by the Nokogiri-backed formats. The including
    # class must provide +nokogiri_class+, +t+, +assume_utf8+ and the
    # Textual helpers called below.
    module ProcessedByNokogiri
      # Parses the local file and yields one entry per node matched by the
      # row selector (:row_css wins over :row_xpath). Each entry is an Array
      # of cell strings when +t.config.output_class+ is ::Array, otherwise an
      # ordered hash of header => cell. The local file is cleaned up
      # afterwards in every case.
      #
      # Rows whose cells are all blank are skipped unless
      # +t.config.keep_blank_rows+ is set.
      #
      # @raise [ArgumentError] when neither :row_css nor :row_xpath is configured
      def each
        require 'nokogiri'
        require 'cgi'

        unless t.config.row_css || t.config.row_xpath
          raise ::ArgumentError, "Need :row_css or :row_xpath in order to process XML or HTML"
        end
        remove_useless_characters!
        transliterate_whole_file_to_utf8!

        # local copy: replaced by the first row's cells when set to :first_row
        headers = t.config.headers

        xml = nokogiri_class.parse(unescaped_xml_without_soft_hyphens, nil, 'UTF-8')
        rows = row_css? ? xml.css(t.config.row_css) : xml.xpath(t.config.row_xpath)
        rows.each do |row|
          cells = if column_css?
            row.css(t.config.column_css)
          elsif column_xpath?
            row.xpath(t.config.column_xpath)
          else
            [row]
          end
          # collapse runs of whitespace inside each cell and trim the ends
          values = cells.map { |cell| assume_utf8 cell.content.gsub(/\s+/, ' ').strip }
          if headers == :first_row
            headers = values.select(&:present?)
            next
          end
          output = if t.config.output_class == ::Array
            values
          else
            zip headers, values
          end
          # Every value is a String, so a bare `values.any?` would be true for
          # any row with at least one cell; test for actual content instead.
          if t.config.keep_blank_rows || values.any? { |v| v.present? }
            yield output
          end
        end
      ensure
        t.local_file.cleanup
      end

      private

      def row_css?
        !!t.config.row_css
      end

      def column_css?
        !!t.config.column_css
      end

      def column_xpath?
        !!t.config.column_xpath
      end

      # Pairs keys with values positionally into an ordered hash.
      # http://snippets.dzone.com/posts/show/406
      def zip(keys, values)
        hash = ::ActiveSupport::OrderedHash.new
        keys.zip(values) { |k, v| hash[k] = v }
        hash
      end

      # Reads the whole local file, HTML-unescapes it and drops any literal
      # "&shy;" left afterwards.
      # should we be doing this in ruby?
      def unescaped_xml_without_soft_hyphens
        str = ::CGI.unescapeHTML t.local_file.encoded_io.read
        # get rid of MS Office baddies
        str.gsub! '&shy;', ''
        str
      end
    end
  end
end
@@ -1,63 +0,0 @@
1
class RemoteTable
  class Format
    # Row iteration shared by the spreadsheet formats. The including class
    # must provide +roo_class+ (the reader to instantiate), +t+ and
    # +assume_utf8+.
    module ProcessedByRoo
      # Opens the local file with +roo_class+ and yields each row between the
      # computed first and last row: as an Array of cell strings when
      # +t.config.output_class+ is ::Array, otherwise as an ordered hash keyed
      # by header. Cell text has anything that looks like a tag (<...>)
      # removed and is stripped. Rows with no present value are skipped
      # unless +t.config.keep_blank_rows+ is set. The local file is cleaned
      # up afterwards in every case.
      def each(&blk)
        require 'iconv'
        require 'roo'

        spreadsheet = roo_class.new t.local_file.path, nil, :ignore
        spreadsheet.default_sheet = t.config.sheet if t.config.sheet

        if t.config.crop
          first_row = t.config.crop.first + 1
          last_row = t.config.crop.last
        else
          first_row = t.config.skip + 1
          last_row = spreadsheet.last_row
        end

        # shared clean-up for a single cell at (row y, column x)
        cell_text = lambda do |y, x|
          assume_utf8 spreadsheet.cell(y, x).to_s.gsub(/<[^>]+>/, '').strip
        end

        if t.config.output_class == ::Array
          (first_row..last_row).each do |y|
            cells = (1..spreadsheet.last_column).map { |x| cell_text.call(y, x) }
            yield cells if t.config.keep_blank_rows || cells.any? { |v| v.present? }
          end
        else
          # header => 1-based column number
          columns = ::ActiveSupport::OrderedHash.new
          if t.config.use_first_row_as_header?
            (1..spreadsheet.last_column).each do |x|
              name = spreadsheet.cell(first_row, x)
              name = spreadsheet.cell(first_row - 1, x) if name.blank? # look up
              next unless name.present?

              columns[assume_utf8(name)] = x # 'foobar' is found at column 6
            end
            # "advance the cursor"
            first_row += 1
          else
            t.config.headers.each_with_index do |name, i|
              columns[name] = i + 1
            end
          end
          (first_row..last_row).each do |y|
            record = ::ActiveSupport::OrderedHash.new
            columns.each do |name, x|
              record[name] = cell_text.call(y, x)
            end
            yield record if t.config.keep_blank_rows || record.any? { |_k, v| v.present? }
          end
        end
      ensure
        t.local_file.cleanup
      end
    end
  end
end
@@ -1,43 +0,0 @@
1
- require 'fileutils'
2
class RemoteTable
  class Format
    # In-place clean-up steps for text-based formats. Every method delegates
    # to +t.local_file.in_place+ with the name of a tool (:perl, :iconv,
    # :tail, :head, :cut) and its arguments; the including class must
    # provide +t+.
    module Textual
      # Escape sequences (kept literal by the single quotes) that are
      # substituted away by perl.
      USELESS_CHARACTERS = [
        '\xef\xbb\xbf', # UTF-8 byte order mark
        '\xc2\xad',     # soft hyphen, often inserted by MS Office (html: &shy;)
      ]

      # Strips every USELESS_CHARACTERS sequence from the file, plus the
      # single-byte soft hyphen when the internal encoding looks like
      # Windows-1252.
      def remove_useless_characters!
        script = "s/#{USELESS_CHARACTERS.join('//g; s/')}//g"
        t.local_file.in_place :perl, script
        if t.config.internal_encoding =~ /windows.?1252/i
          # soft hyphen again, as I have seen it appear in windows 1252
          t.local_file.in_place :perl, 's/\xad//g'
        end
      end

      # Converts the file from the internal to the external encoding and
      # records the external encoding under :encoding in the user options.
      def transliterate_whole_file_to_utf8!
        t.local_file.in_place :iconv, t.config.external_encoding_iconv, t.config.internal_encoding
        t.config.user_specified_options[:encoding] = t.config.external_encoding
      end

      # Normalizes \r\n, \n and \r line endings to \n.
      def fix_newlines!
        t.local_file.in_place :perl, 's/\r\n|\n|\r/\n/g'
      end

      # Drops the first +t.config.skip+ lines; does nothing when skip is 0.
      def skip_rows!
        t.local_file.in_place :tail, "+#{t.config.skip + 1}" if t.config.skip > 0
      end

      # Keeps only the lines from crop.first through crop.last (inclusive);
      # does nothing when no crop is configured.
      def crop_rows!
        if t.config.crop
          t.local_file.in_place :tail, "+#{t.config.crop.first}"
          t.local_file.in_place :head, (t.config.crop.last - t.config.crop.first + 1)
        end
      end

      # Passes +t.config.cut+ to the :cut tool; does nothing when unset.
      def cut_columns!
        t.local_file.in_place :cut, t.config.cut if t.config.cut
      end
    end
  end
end
@@ -1,13 +0,0 @@
1
class RemoteTable
  class Format
    # OpenOffice (ODS) format: row iteration comes from ProcessedByRoo, this
    # class only names the reader class to use.
    class OpenOffice < Format
      include ProcessedByRoo

      # Warns on JRuby (where RUBY_PLATFORM is 'java') every time it is
      # called, then returns the reader class.
      #
      # @return [Class] the top-level ::Openoffice constant
      def roo_class
        ::Kernel.warn "[remote_table] Opening ODS files on JRuby is known to fail because of a flaw in the underlying Roo library" if ::RUBY_PLATFORM == 'java'
        ::Openoffice
      end
    end
  end
end
@@ -1,12 +0,0 @@
1
class RemoteTable
  class Format
    # XML format: combines the text clean-up helpers from Textual with the
    # row extraction from ProcessedByNokogiri.
    class XML < Format
      include Textual
      include ProcessedByNokogiri

      # @return [Class] the Nokogiri document class used to parse the file
      def nokogiri_class
        ::Nokogiri::XML::Document
      end
    end
  end
end
@@ -1,14 +0,0 @@
1
- require 'yaml'
2
-
3
class RemoteTable
  class Format
    # YAML format: the whole local file is loaded at once and its top-level
    # collection is iterated.
    class Yaml < Format
      # Loads the local file and passes each top-level element to the block.
      # The local file is cleaned up afterwards in every case.
      #
      # NOTE(review): YAML.load_file is applied to a file that presumably
      # was downloaded from a remote source. Depending on the Ruby/Psych
      # version this can instantiate arbitrary objects; confirm the source is
      # trusted or consider a safe loader.
      def each(&blk)
        YAML.load_file(t.local_file.path).each(&blk)
      ensure
        t.local_file.cleanup
      end
    end
  end
end