remote_table 1.4.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,91 @@
1
class RemoteTable
  # Reads delimited plaintext -- comma-separated (CSV), tab-separated (TSV), or any other
  # delimiter -- through Ruby's CSV parser.
  module Delimited
    # Anything extended with Delimited is also extended with Plaintext, which supplies
    # the file-preparation helpers called at the top of #_each.
    def self.extended(base)
      base.extend Plaintext
    end

    # Pick the parser once, at load time: CSV on Ruby 1.9 and later, FasterCSV before that.
    if ::RUBY_VERSION >= '1.9'
      require 'csv'
      Engine = ::CSV
    else
      require 'fastercsv'
      Engine = ::FasterCSV
    end

    # Names of user options that #csv_options copies to the parser unchanged.
    PASSTHROUGH_CSV_SETTINGS = [
      :unconverted_fields, :col_sep, :row_sep, :return_headers,
      :header_converters, :quote_char, :converters, :force_quotes,
    ]

    # Prepares the local copy, then yields every parsed row.
    #
    # Rows are yielded as an ordered hash (header => value) when +headers+ is set and as
    # an array otherwise; every value is converted with +to_s+. Unless +keep_blank_rows+
    # is set, rows without a single present value are not yielded. Columns whose header
    # is blank are left out of the hash.
    #
    # The local copy is cleaned up when iteration ends, including when it ends by exception.
    def _each
      delete_harmful!
      convert_eol_to_unix!
      transliterate_whole_file_to_utf8!
      skip_rows!

      Engine.new(local_copy.encoded_io, csv_options).each do |row|
        has_content = false

        if headers
          # hash form: one entry per column that has a usable header
          record = ::ActiveSupport::OrderedHash.new
          row.each do |name, value|
            if name.present?
              text = value.to_s
              has_content = true if !has_content && !keep_blank_rows && text.present?
              record[name] = text
            end
          end
          yield record if has_content || keep_blank_rows
        else
          # array form: values in column order
          cells = row.map do |value|
            text = value.to_s
            has_content = true if !has_content && !keep_blank_rows && text.present?
            text
          end
          yield cells if has_content || keep_blank_rows
        end
      end
    ensure
      local_copy.cleanup
    end

    # Builds the options hash given to the CSV parser.
    #
    # Starts from the user options named in PASSTHROUGH_CSV_SETTINGS, then:
    # * sets +:skip_blanks+ to the opposite of +keep_blank_rows+
    # * fills +:headers+ from +headers+ when it is not already set
    # * fills +:col_sep+ from +delimiter+ when it is not already set
    #
    # @return [Hash]
    def csv_options
      settings = other_options.slice(*PASSTHROUGH_CSV_SETTINGS)
      settings[:skip_blanks] = !keep_blank_rows
      settings[:headers] = headers unless settings[:headers]
      settings[:col_sep] = delimiter unless settings[:col_sep]
      settings
    end
  end
end
@@ -0,0 +1,81 @@
1
class RemoteTable
  # Parses plaintext fixed-width files using https://github.com/seamusabshere/fixed_width
  module FixedWidth
    # Anything extended with FixedWidth is also extended with Plaintext, which supplies
    # the file-preparation helpers called at the top of #_each.
    def self.extended(base)
      base.extend Plaintext
    end

    # Block handed to +row.trap+ in #definition; it returns true for any argument.
    TRAP_EVERYTHING = proc { |_| true }

    # Creates the mutexes that guard lazy construction of the parser and the definition.
    #
    # @private
    def after_extend
      @parser_mutex = ::Mutex.new
      @definition_mutex = ::Mutex.new
    end

    # Prepares the local copy, parses it, and yields each row as an ordered hash.
    #
    # Values are converted with +to_s+ and stripped. Unless +keep_blank_rows+ is set,
    # rows without a single present value are not yielded. The local copy is cleaned up
    # when iteration ends, including when it ends by exception.
    def _each
      # loaded lazily, so the library is only needed when a fixed-width file is read
      require 'fixed_width-multibyte'

      delete_harmful!
      convert_eol_to_unix!
      transliterate_whole_file_to_utf8!
      crop_rows!
      skip_rows!
      cut_columns!

      parser.parse[:rows].each do |row|
        some_value_present = false
        hash = ::ActiveSupport::OrderedHash.new
        row.each do |k, v|
          v = v.to_s.strip
          if not some_value_present and not keep_blank_rows and v.present?
            some_value_present = true
          end
          hash[k] = v
        end
        if some_value_present or keep_blank_rows
          yield hash
        end
      end
    ensure
      local_copy.cleanup
    end

    private

    # Memoized ::FixedWidth::Parser built from #definition and the local copy's IO.
    #
    # @raise [RuntimeError] if ::FixedWidth::Section has a private +unpacker+ method,
    #   which this code treats as the sign that a different fixed_width library is loaded
    def parser
      @parser || @parser_mutex.synchronize do
        @parser ||= begin
          if ::FixedWidth::Section.private_instance_methods.map(&:to_sym).include?(:unpacker)
            raise ::RuntimeError, "[remote_table] You need to use exclusively the fixed_width-multibyte library https://github.com/seamusabshere/fixed_width"
          end
          ::FixedWidth::Parser.new definition, local_copy.encoded_io
        end
      end
    end

    # Memoized fixed-width definition.
    #
    # When +schema_name+ is a String or Symbol, the definition registered under that name
    # is looked up. Otherwise, when +schema+ is an Array, a definition is built under a
    # randomly generated name from its +[name, width, options]+ entries; an entry named
    # 'spacer' becomes a spacer of that width.
    #
    # @raise [ArgumentError] if neither form of schema was supplied
    def definition
      @definition || @definition_mutex.synchronize do
        @definition ||= if schema_name.is_a?(::String) or schema_name.is_a?(::Symbol)
          ::FixedWidth.send :definition, schema_name
        elsif schema.is_a?(::Array)
          ::FixedWidth.define("remote_table-fixed_with-#{::Kernel.rand}") do |d|
            d.rows do |row|
              row.trap(&TRAP_EVERYTHING)
              schema.each do |name, width, options|
                name = name.to_s
                if name == 'spacer'
                  row.spacer width
                else
                  # A two-element entry such as ['id', 4] leaves +options+ nil. Forward an
                  # empty hash instead of nil; presumably +column+ expects a Hash here
                  # (TODO confirm against the fixed_width library).
                  row.column name, width, (options || {})
                end
              end
            end
          end
        else
          raise ::ArgumentError, "[remote_table] Expecting :schema_name to be a String or Symbol, or :schema to be an Array"
        end
      end
    end
  end
end
@@ -0,0 +1,13 @@
1
class RemoteTable
  # Handles HTML and XHTML sources, parsed with Nokogiri's Nokogiri::HTML::Document class.
  module Html
    # Extends +base+ with Plaintext first and ProcessedByNokogiri second.
    def self.extended(base)
      [Plaintext, ProcessedByNokogiri].each { |mod| base.extend mod }
    end

    # @return [Class] the Nokogiri document class used to parse the file
    def nokogiri_class
      ::Nokogiri::HTML::Document
    end
  end
end
@@ -2,7 +2,7 @@ require 'fileutils'
2
2
  require 'unix_utils'
3
3
 
4
4
  class RemoteTable
5
- class LocalFile #:nodoc:all
5
+ class LocalCopy #:nodoc:all
6
6
  class << self
7
7
  def decompress(input, compression)
8
8
  output = case compression
@@ -59,6 +59,8 @@ class RemoteTable
59
59
 
60
60
  def initialize(t)
61
61
  @t = t
62
+ @encoded_io_mutex = ::Mutex.new
63
+ @generate_mutex = ::Mutex.new
62
64
  end
63
65
 
64
66
  def in_place(*args)
@@ -68,15 +70,17 @@ class RemoteTable
68
70
  end
69
71
 
70
72
  def path
71
- generate unless generated?
73
+ generate unless @generated
72
74
  @path
73
75
  end
74
76
 
75
77
  def encoded_io
76
- @encoded_io ||= if ::RUBY_VERSION >= '1.9'
77
- ::File.open path, 'rb', :internal_encoding => t.config.internal_encoding, :external_encoding => t.config.external_encoding
78
- else
79
- ::File.open path, 'rb'
78
+ @encoded_io || @encoded_io_mutex.synchronize do
79
+ @encoded_io ||= if ::RUBY_VERSION >= '1.9'
80
+ ::File.open path, 'rb', :internal_encoding => t.internal_encoding, :external_encoding => RemoteTable::EXTERNAL_ENCODING
81
+ else
82
+ ::File.open path, 'rb'
83
+ end
80
84
  end
81
85
  end
82
86
 
@@ -94,24 +98,24 @@ class RemoteTable
94
98
 
95
99
  private
96
100
 
97
- def generated?
98
- @generated == true
99
- end
100
-
101
101
  def generate
102
- # sabshere 7/20/11 make web requests move more slowly so you don't get accused of DOS
103
- if ::ENV.has_key?('REMOTE_TABLE_DELAY_BETWEEN_REQUESTS')
104
- ::Kernel.sleep ::ENV['REMOTE_TABLE_DELAY_BETWEEN_REQUESTS'].to_i
105
- end
106
- tmp_path = ::UnixUtils.curl t.config.uri.to_s, t.config.form_data
107
- if compression = t.config.compression
108
- tmp_path = LocalFile.decompress tmp_path, compression
109
- end
110
- if packing = t.config.packing
111
- tmp_path = LocalFile.unpack tmp_path, packing
102
+ return if @generated
103
+ @generate_mutex.synchronize do
104
+ return if @generated
105
+ @generated = true
106
+ # sabshere 7/20/11 make web requests move more slowly so you don't get accused of DOS
107
+ if ::ENV.has_key?('REMOTE_TABLE_DELAY_BETWEEN_REQUESTS')
108
+ ::Kernel.sleep ::ENV['REMOTE_TABLE_DELAY_BETWEEN_REQUESTS'].to_i
109
+ end
110
+ tmp_path = ::UnixUtils.curl t.url, t.form_data
111
+ if compression = t.compression
112
+ tmp_path = LocalCopy.decompress tmp_path, compression
113
+ end
114
+ if packing = t.packing
115
+ tmp_path = LocalCopy.unpack tmp_path, packing
116
+ end
117
+ @path = LocalCopy.pick tmp_path, :filename => t.filename, :glob => t.glob
112
118
  end
113
- @path = LocalFile.pick tmp_path, :filename => t.config.filename, :glob => t.config.glob
114
- @generated = true
115
119
  end
116
120
  end
117
121
  end
@@ -0,0 +1,17 @@
1
class RemoteTable
  # Parses ODS files using Roo's Openoffice class.
  #
  # Known to have issues on JRuby.
  module Ods
    # Anything extended with Ods is also extended with ProcessedByRoo.
    def self.extended(base)
      base.extend ProcessedByRoo
    end

    # Returns the Roo class used to open the file. When RUBY_PLATFORM is 'java',
    # a warning is printed first.
    def roo_class
      on_jruby = (::RUBY_PLATFORM == 'java')
      ::Kernel.warn "[remote_table] Opening ODS files on JRuby is known to fail because of a flaw in the underlying Roo library" if on_jruby
      ::Openoffice
    end
  end
end
@@ -0,0 +1,67 @@
1
+ require 'fileutils'
2
+
3
class RemoteTable
  # Helpers that rewrite a plaintext local copy in place before it is parsed.
  module Plaintext
    # Byte sequences removed by #delete_harmful!. They are kept as literal backslash
    # escapes because they are spliced into a Perl substitution, not interpreted by Ruby.
    CONSIDERED_HARMFUL = [
      '\xef\xbb\xbf', # UTF-8 byte order mark
      '\xc2\xad',     # soft hyphen, often inserted by MS Office (html: &shy;)
      '\xad'          # any remaining soft hyphens (sometimes seen in windows-1252)
    ]
    # Perl substitution that turns every line ending into \n.
    EOL_TO_UNIX = 's/\r\n|\n|\r/\n/g'

    # Strip the byte sequences listed in CONSIDERED_HARMFUL, one Perl substitution each.
    #
    # NOTE(review): the last pattern removes every 0xAD byte; presumably that can also hit
    # bytes inside multibyte UTF-8 characters -- confirm against real input.
    def delete_harmful!
      program = CONSIDERED_HARMFUL.map { |bytes| "s/#{bytes}//g" }.join('; ')
      local_copy.in_place :perl, program
    end

    # Whatever the file encoding is SUPPOSED to be, push it through iconv, then record
    # RemoteTable::EXTERNAL_ENCODING as the internal encoding from here on.
    #
    # @example
    #   iconv -c -t UTF-8//TRANSLIT -f WINDOWS-1252
    def transliterate_whole_file_to_utf8!
      source_encoding = internal_encoding
      local_copy.in_place :iconv, RemoteTable::EXTERNAL_ENCODING_ICONV, source_encoding
      # after the forced conversion, behave as if the user had asked for this encoding
      @internal_encoding = RemoteTable::EXTERNAL_ENCODING
    end

    # Whatever the line endings are SUPPOSED to be, run the file through Perl so that
    # all of them become \n.
    #
    # @example
    #   perl -pe 's/\r\n|\n|\r/\n/g'
    def convert_eol_to_unix!
      local_copy.in_place :perl, EOL_TO_UNIX
    end

    # Drop leading rows with tail when the user specified a positive :skip.
    #
    # @example :skip => 6
    #   tail +7
    def skip_rows!
      return unless skip > 0
      local_copy.in_place :tail, "+#{skip + 1}"
    end

    # Keep only the requested rows, using tail and then head, when the user specified :crop.
    #
    # @example :crop => (184..263)
    #   tail +184 | head 80
    def crop_rows!
      return unless crop
      first_line = crop.first
      local_copy.in_place :tail, "+#{first_line}"
      local_copy.in_place :head, (crop.last - first_line + 1)
    end

    # Keep only the requested character columns, using cut, when the user specified :cut.
    #
    # @example :cut => '13-'
    #   cut -c 13-
    def cut_columns!
      return unless cut
      local_copy.in_place :cut, cut
    end
  end
end
@@ -0,0 +1,76 @@
1
class RemoteTable
  # Mixed in to process XML and XHTML.
  module ProcessedByNokogiri
    WHITESPACE = /\s+/
    SINGLE_SPACE = ' '
    SOFT_HYPHEN = '&shy;'

    # Yield each row using Nokogiri.
    #
    # Rows are selected with +row_css+ (preferred) or +row_xpath+; cells within a row with
    # +column_css+ or +column_xpath+, falling back to the row node itself. Cell text has
    # runs of whitespace collapsed to a single space and is stripped.
    #
    # When +headers+ is +:first_row+, the present values of the first row become the
    # header names and that row is not yielded. Rows are yielded as an ordered hash when
    # +headers+ is set and as an array otherwise. Unless +keep_blank_rows+ is set, rows
    # without a single present value are not yielded.
    #
    # The local copy is cleaned up when iteration ends, including when it ends by exception.
    #
    # @raise [ArgumentError] if neither +row_css+ nor +row_xpath+ is set
    def _each
      require 'nokogiri'
      require 'cgi'

      # kept in a local because a :first_row marker is replaced by real names inside the loop
      current_headers = headers

      if !(row_css or row_xpath)
        raise ::ArgumentError, "[remote_table] Need :row_css or :row_xpath in order to process XML or HTML"
      end

      delete_harmful!
      transliterate_whole_file_to_utf8!

      document = nokogiri_class.parse(unescaped_xml_without_soft_hyphens, nil, RemoteTable::EXTERNAL_ENCODING)
      row_nodes = if row_css
        document.css(row_css)
      else
        document.xpath(row_xpath)
      end

      row_nodes.each do |row_node|
        found_content = false

        cell_nodes = if column_css
          row_node.css column_css
        elsif column_xpath
          row_node.xpath column_xpath
        else
          [row_node]
        end

        values = cell_nodes.map do |cell_node|
          text = assume_utf8(cell_node.content.dup)
          text.gsub! WHITESPACE, SINGLE_SPACE
          text.strip!
          found_content = true if !found_content && !keep_blank_rows && text.present?
          text
        end

        if current_headers == :first_row
          # this row only supplies the header names
          current_headers = values.select(&:present?)
          next
        end

        next unless (keep_blank_rows || found_content)

        if headers
          yield zip(current_headers, values)
        else
          yield values
        end
      end
    ensure
      local_copy.cleanup
    end

    private

    # Pairs +keys+ with +values+ by position and returns the result as an ordered hash.
    #
    # http://snippets.dzone.com/posts/show/406
    def zip(keys, values)
      keys.zip(values).inject(::ActiveSupport::OrderedHash.new) do |pairs, (k, v)|
        pairs[k] = v
        pairs
      end
    end

    # Reads the whole local copy, rewinds it, HTML-unescapes the text and removes
    # every '&shy;' that is left.
    #
    # should we be doing this in ruby?
    def unescaped_xml_without_soft_hyphens
      source = local_copy.encoded_io
      markup = ::CGI.unescapeHTML source.read
      source.rewind
      # get rid of MS Office baddies
      markup.gsub! SOFT_HYPHEN, ''
      markup
    end
  end
end
@@ -0,0 +1,97 @@
1
class RemoteTable
  # Mixed in to process XLS, XLSX, and ODS with the Roo library.
  module ProcessedByRoo
    # Matches anything that looks like a markup tag; such text is removed from cell values.
    TAG = /<[^>]+>/
    BLANK = ''

    # Yield each row using Roo.
    #
    # Opens +local_copy.path+ with +roo_class+ and selects +sheet+ when one is given.
    # With +crop+, rows crop.first + 1 through crop.last are read; otherwise rows
    # skip + 1 through the spreadsheet's last row.
    # NOTE(review): the crop case starts one row after crop.first -- confirm that is intended.
    #
    # Cell values are converted with +to_s+, have tags removed, and are stripped.
    # Rows are yielded as an array when +headers+ is not set and as an ordered hash
    # otherwise. With +headers+ set to +:first_row+ the header names come from the first
    # row read (falling back to the row above it for blank cells) and that row is not
    # yielded; with any other value, +headers+ is enumerated to name columns 1, 2, 3, ...
    # Unless +keep_blank_rows+ is set, rows without a single present value are not yielded.
    #
    # The local copy is cleaned up when iteration ends, including when it ends by exception.
    def _each
      # sometimes Roo forgets to require iconv.
      # iconv is not part of the standard library on Ruby 2.0 and later, so a missing
      # iconv must not abort the read before Roo is even loaded; presumably Roo raises
      # on its own if it really needs it.
      begin
        require 'iconv'
      rescue ::LoadError
        # carry on without iconv
      end
      require 'roo'

      spreadsheet = roo_class.new local_copy.path, nil, :ignore
      if sheet
        spreadsheet.default_sheet = sheet
      end

      first_row = if crop
        crop.first + 1
      else
        skip + 1
      end

      last_row = if crop
        crop.last
      else
        spreadsheet.last_row
      end

      if not headers

        # create an array to represent this row
        (first_row..last_row).each do |y|
          some_value_present = false
          output = (1..spreadsheet.last_column).map do |x|
            memo = spreadsheet.cell(y, x).to_s.dup
            memo = assume_utf8 memo
            memo.gsub! TAG, BLANK
            memo.strip!
            if not some_value_present and not keep_blank_rows and memo.present?
              some_value_present = true
            end
            memo
          end
          if keep_blank_rows or some_value_present
            yield output
          end
        end

      else

        # create a hash to represent this row; current_headers maps header name => column number
        current_headers = ::ActiveSupport::OrderedHash.new
        if headers == :first_row
          (1..spreadsheet.last_column).each do |x|
            v = spreadsheet.cell(first_row, x)
            if v.blank?
              # then look up one
              v = spreadsheet.cell(first_row - 1, x)
            end
            if v.present?
              v = assume_utf8 v
              # 'foobar' is found at column 6
              current_headers[v] = x
            end
          end
          # "advance the cursor"
          first_row += 1
        else
          headers.each_with_index do |k, i|
            current_headers[k] = i + 1
          end
        end
        (first_row..last_row).each do |y|
          some_value_present = false
          output = ::ActiveSupport::OrderedHash.new
          current_headers.each do |k, x|
            memo = spreadsheet.cell(y, x).to_s.dup
            memo = assume_utf8 memo
            memo.gsub! TAG, BLANK
            memo.strip!
            if not some_value_present and not keep_blank_rows and memo.present?
              some_value_present = true
            end
            output[k] = memo
          end
          if keep_blank_rows or some_value_present
            yield output
          end
        end

      end
    ensure
      local_copy.cleanup
    end
  end
end