remote_table 1.4.0 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,91 @@
1
class RemoteTable
  # Reads delimiter-separated plaintext (CSV, TSV, or any other delimiter) by handing the
  # cleaned-up local copy to Ruby's CSV parser.
  module Delimited
    # Anything extended with Delimited is also extended with Plaintext, whose
    # clean-up helpers are called from #_each.
    def self.extended(base)
      base.extend Plaintext
    end

    # Ruby 1.9+ provides the parser as CSV; older Rubies fall back to FasterCSV.
    if ::RUBY_VERSION >= '1.9'
      require 'csv'
      Engine = ::CSV
    else
      require 'fastercsv'
      Engine = ::FasterCSV
    end

    # User-supplied options that are handed to the CSV engine untouched.
    PASSTHROUGH_CSV_SETTINGS = [
      :unconverted_fields, :col_sep, :row_sep, :return_headers,
      :header_converters, :quote_char, :converters, :force_quotes,
    ]

    # Clean up the local copy, then yield every parsed row.
    #
    # With headers, a row is yielded as an ordered hash (columns whose header is blank
    # are dropped); without headers it is yielded as an array. All values are strings.
    # Rows with no present value are skipped unless keep_blank_rows is set.
    # The local copy is cleaned up when iteration finishes or raises.
    def _each
      delete_harmful!
      convert_eol_to_unix!
      transliterate_whole_file_to_utf8!
      skip_rows!

      Engine.new(local_copy.encoded_io, csv_options).each do |row|
        has_content = false

        if headers
          # hash representation
          record = ::ActiveSupport::OrderedHash.new
          row.each do |name, value|
            next unless name.present?
            value = value.to_s
            has_content ||= (!keep_blank_rows && value.present?)
            record[name] = value
          end
          yield record if has_content || keep_blank_rows
        else
          # array representation
          cells = row.map do |value|
            value = value.to_s
            has_content ||= (!keep_blank_rows && value.present?)
            value
          end
          yield cells if has_content || keep_blank_rows
        end
      end
    ensure
      local_copy.cleanup
    end

    # Builds the options hash for the CSV engine.
    #
    # Starts from the user-specified options listed in PASSTHROUGH_CSV_SETTINGS, then:
    # * sets +:skip_blanks+ to the opposite of +keep_blank_rows+
    # * fills +:headers+ from +headers+ when not already set
    # * fills +:col_sep+ from +delimiter+ when not already set
    #
    # @return [Hash]
    def csv_options
      settings = other_options.slice(*PASSTHROUGH_CSV_SETTINGS)
      settings[:skip_blanks] = !keep_blank_rows
      settings[:headers] ||= headers
      settings[:col_sep] ||= delimiter
      settings
    end
  end
end
@@ -0,0 +1,81 @@
1
class RemoteTable
  # Reads fixed-width plaintext files via https://github.com/seamusabshere/fixed_width
  module FixedWidth
    # Anything extended with FixedWidth is also extended with Plaintext, whose
    # clean-up helpers are called from #_each.
    def self.extended(base)
      base.extend Plaintext
    end

    # Row trap that accepts every line.
    TRAP_EVERYTHING = proc { |_| true }

    # Creates the mutexes that guard lazy construction of the parser and the definition.
    #
    # @private
    def after_extend
      @parser_mutex = ::Mutex.new
      @definition_mutex = ::Mutex.new
    end

    # Clean up the local copy, then yield every parsed row as an ordered hash of
    # stripped string values. Rows with no present value are skipped unless
    # keep_blank_rows is set. The local copy is cleaned up afterwards, even on error.
    def _each
      require 'fixed_width-multibyte'

      delete_harmful!
      convert_eol_to_unix!
      transliterate_whole_file_to_utf8!
      crop_rows!
      skip_rows!
      cut_columns!

      parser.parse[:rows].each do |row|
        has_content = false
        record = ::ActiveSupport::OrderedHash.new
        row.each do |name, value|
          value = value.to_s.strip
          has_content ||= (!keep_blank_rows && value.present?)
          record[name] = value
        end
        yield record if has_content || keep_blank_rows
      end
    ensure
      local_copy.cleanup
    end

    private

    # Lazily builds (once, under @parser_mutex) the parser for the local copy.
    # Raises if the loaded ::FixedWidth::Section has a private #unpacker method,
    # which this code takes to mean the wrong fixed_width library is loaded.
    def parser
      return @parser if @parser
      @parser_mutex.synchronize do
        @parser ||= begin
          section_private_methods = ::FixedWidth::Section.private_instance_methods.map(&:to_sym)
          if section_private_methods.include?(:unpacker)
            raise ::RuntimeError, "[remote_table] You need to use exclusively the fixed_width-multibyte library https://github.com/seamusabshere/fixed_width"
          end
          ::FixedWidth::Parser.new definition, local_copy.encoded_io
        end
      end
    end

    # Lazily resolves (once, under @definition_mutex) the fixed-width definition:
    # looked up by +schema_name+ when that is a String or Symbol, otherwise built
    # from the +schema+ Array of [name, width, options] entries, where an entry
    # named 'spacer' becomes a spacer instead of a column.
    def definition
      return @definition if @definition
      @definition_mutex.synchronize do
        @definition ||= case schema_name
        when ::String, ::Symbol
          ::FixedWidth.send :definition, schema_name
        else
          unless schema.is_a?(::Array)
            raise ::ArgumentError, "[remote_table] Expecting :schema_name to be a String or Symbol, or :schema to be an Array"
          end
          ::FixedWidth.define("remote_table-fixed_with-#{::Kernel.rand}") do |d|
            d.rows do |row|
              row.trap(&TRAP_EVERYTHING)
              schema.each do |column_name, width, options|
                column_name = column_name.to_s
                if column_name == 'spacer'
                  row.spacer width
                else
                  row.column column_name, width, options
                end
              end
            end
          end
        end
      end
    end
  end
end
@@ -0,0 +1,13 @@
1
class RemoteTable
  # Reads [X]HTML documents, parsed with Nokogiri's Nokogiri::HTML::Document class.
  module Html
    # Mixes in the helper modules: Plaintext first, then ProcessedByNokogiri.
    def self.extended(base)
      [Plaintext, ProcessedByNokogiri].each { |helper| base.extend helper }
    end

    # @return [Class] the Nokogiri document class used to parse the file
    def nokogiri_class
      ::Nokogiri::HTML::Document
    end
  end
end
@@ -2,7 +2,7 @@ require 'fileutils'
2
2
  require 'unix_utils'
3
3
 
4
4
  class RemoteTable
5
- class LocalFile #:nodoc:all
5
+ class LocalCopy #:nodoc:all
6
6
  class << self
7
7
  def decompress(input, compression)
8
8
  output = case compression
@@ -59,6 +59,8 @@ class RemoteTable
59
59
 
60
60
  def initialize(t)
61
61
  @t = t
62
+ @encoded_io_mutex = ::Mutex.new
63
+ @generate_mutex = ::Mutex.new
62
64
  end
63
65
 
64
66
  def in_place(*args)
@@ -68,15 +70,17 @@ class RemoteTable
68
70
  end
69
71
 
70
72
  def path
71
- generate unless generated?
73
+ generate unless @generated
72
74
  @path
73
75
  end
74
76
 
75
77
  def encoded_io
76
- @encoded_io ||= if ::RUBY_VERSION >= '1.9'
77
- ::File.open path, 'rb', :internal_encoding => t.config.internal_encoding, :external_encoding => t.config.external_encoding
78
- else
79
- ::File.open path, 'rb'
78
+ @encoded_io || @encoded_io_mutex.synchronize do
79
+ @encoded_io ||= if ::RUBY_VERSION >= '1.9'
80
+ ::File.open path, 'rb', :internal_encoding => t.internal_encoding, :external_encoding => RemoteTable::EXTERNAL_ENCODING
81
+ else
82
+ ::File.open path, 'rb'
83
+ end
80
84
  end
81
85
  end
82
86
 
@@ -94,24 +98,24 @@ class RemoteTable
94
98
 
95
99
  private
96
100
 
97
- def generated?
98
- @generated == true
99
- end
100
-
101
101
  def generate
102
- # sabshere 7/20/11 make web requests move more slowly so you don't get accused of DOS
103
- if ::ENV.has_key?('REMOTE_TABLE_DELAY_BETWEEN_REQUESTS')
104
- ::Kernel.sleep ::ENV['REMOTE_TABLE_DELAY_BETWEEN_REQUESTS'].to_i
105
- end
106
- tmp_path = ::UnixUtils.curl t.config.uri.to_s, t.config.form_data
107
- if compression = t.config.compression
108
- tmp_path = LocalFile.decompress tmp_path, compression
109
- end
110
- if packing = t.config.packing
111
- tmp_path = LocalFile.unpack tmp_path, packing
102
+ return if @generated
103
+ @generate_mutex.synchronize do
104
+ return if @generated
105
+ @generated = true
106
+ # sabshere 7/20/11 make web requests move more slowly so you don't get accused of DOS
107
+ if ::ENV.has_key?('REMOTE_TABLE_DELAY_BETWEEN_REQUESTS')
108
+ ::Kernel.sleep ::ENV['REMOTE_TABLE_DELAY_BETWEEN_REQUESTS'].to_i
109
+ end
110
+ tmp_path = ::UnixUtils.curl t.url, t.form_data
111
+ if compression = t.compression
112
+ tmp_path = LocalCopy.decompress tmp_path, compression
113
+ end
114
+ if packing = t.packing
115
+ tmp_path = LocalCopy.unpack tmp_path, packing
116
+ end
117
+ @path = LocalCopy.pick tmp_path, :filename => t.filename, :glob => t.glob
112
118
  end
113
- @path = LocalFile.pick tmp_path, :filename => t.config.filename, :glob => t.config.glob
114
- @generated = true
115
119
  end
116
120
  end
117
121
  end
@@ -0,0 +1,17 @@
1
class RemoteTable
  # Reads ODS spreadsheets through Roo's Openoffice class.
  #
  # Known to have issues on JRuby.
  module Ods
    # Anything extended with Ods is also extended with ProcessedByRoo.
    def self.extended(base)
      base.extend ProcessedByRoo
    end

    # Returns the Roo class used to open the file, warning first when running on JRuby.
    def roo_class
      ::Kernel.warn "[remote_table] Opening ODS files on JRuby is known to fail because of a flaw in the underlying Roo library" if ::RUBY_PLATFORM == 'java'
      ::Openoffice
    end
  end
end
@@ -0,0 +1,67 @@
1
+ require 'fileutils'
2
+
3
class RemoteTable
  # Helper methods that act on plaintext files before they are parsed.
  #
  # Every helper rewrites the local copy through +local_copy.in_place+, passing the
  # name of a tool (:perl, :iconv, :tail, :head, :cut) followed by its arguments.
  module Plaintext
    # Byte sequences removed by #delete_harmful!. They are single-quoted on purpose:
    # the backslash escapes are passed through literally as part of a perl expression.
    CONSIDERED_HARMFUL = [
      '\xef\xbb\xbf',   # UTF-8 byte order mark
      '\xc2\xad',       # soft hyphen, often inserted by MS Office (html: &shy;)
      '\xad'            # any remaining soft hyphens (sometimes seen in windows-1252)
    ]
    # perl substitution that rewrites every kind of line ending as \n
    EOL_TO_UNIX = 's/\r\n|\n|\r/\n/g'

    # Remove bytes that are both useless and harmful in the vast majority of cases.
    #
    # Builds one perl program with a +s/<bytes>//g+ substitution per CONSIDERED_HARMFUL entry.
    def delete_harmful!
      local_copy.in_place :perl, "s/#{CONSIDERED_HARMFUL.join('//g; s/')}//g"
    end

    # No matter what the file encoding is SUPPOSED to be, run it through iconv to make sure it's UTF-8
    #
    # @example
    #   iconv -c -t UTF-8//TRANSLIT -f WINDOWS-1252
    def transliterate_whole_file_to_utf8!
      local_copy.in_place :iconv, RemoteTable::EXTERNAL_ENCODING_ICONV, internal_encoding
      # now that we've force-transliterated to UTF-8, act as though this is what the user had specified
      @internal_encoding = RemoteTable::EXTERNAL_ENCODING
    end

    # No matter what the EOL are SUPPOSED to be, run it through Perl with a regex that will convert all EOLS to \n
    #
    # @example
    #   perl -pe 's/\r\n|\n|\r/\n/g'
    def convert_eol_to_unix!
      local_copy.in_place :perl, EOL_TO_UNIX
    end

    # If the user has specified :skip, use tail
    #
    # @example :skip => 6
    #   tail +7
    def skip_rows!
      if skip > 0
        local_copy.in_place :tail, "+#{skip + 1}"
      end
    end

    # If the user has specified :crop, use a combination of tail and head
    #
    # An exclusive range stops one row earlier than its end, as Ruby ranges do.
    # Crops that are not Ranges but respond to #first and #last (e.g. a two-element
    # Array) are treated as inclusive.
    #
    # @example :crop => (184..263)
    #   tail +184 | head 80
    # @example :crop => (184...263)
    #   tail +184 | head 79
    def crop_rows!
      if crop
        # Range#last returns the end value even when the range excludes it
        excludes_end = crop.respond_to?(:exclude_end?) && crop.exclude_end?
        last_row = excludes_end ? crop.last - 1 : crop.last
        local_copy.in_place :tail, "+#{crop.first}"
        local_copy.in_place :head, (last_row - crop.first + 1)
      end
    end

    # If the user has specified :cut, use cut
    #
    # @example :cut => '13-'
    #   cut -c 13-
    def cut_columns!
      if cut
        local_copy.in_place :cut, cut
      end
    end
  end
end
@@ -0,0 +1,76 @@
1
class RemoteTable
  # Row-extraction logic shared by the XML and XHTML formats.
  module ProcessedByNokogiri
    WHITESPACE = /\s+/
    SINGLE_SPACE = ' '
    SOFT_HYPHEN = '&shy;'

    # Parse the local copy with Nokogiri and yield each row.
    #
    # Rows are selected with :row_css or :row_xpath (one of them is required) and cells
    # with :column_css or :column_xpath; without a column selector the row node itself
    # is the only cell. Cell text has runs of whitespace collapsed and is stripped.
    # Rows are yielded as arrays without headers and as ordered hashes with them;
    # with +:first_row+ headers the first row supplies the keys and is not yielded.
    # Rows with no present cell are skipped unless keep_blank_rows is set.
    # The local copy is cleaned up afterwards, even on error.
    def _each
      require 'nokogiri'
      require 'cgi'

      # kept in a local because the first row may replace it below
      current_headers = headers

      raise ::ArgumentError, "[remote_table] Need :row_css or :row_xpath in order to process XML or HTML" unless row_css || row_xpath

      delete_harmful!
      transliterate_whole_file_to_utf8!

      document = nokogiri_class.parse(unescaped_xml_without_soft_hyphens, nil, RemoteTable::EXTERNAL_ENCODING)
      row_nodes = row_css ? document.css(row_css) : document.xpath(row_xpath)
      row_nodes.each do |row_node|
        cell_nodes = if column_css
          row_node.css column_css
        elsif column_xpath
          row_node.xpath column_xpath
        else
          [row_node]
        end

        has_content = false
        values = cell_nodes.map do |cell|
          text = assume_utf8(cell.content.dup)
          text.gsub! WHITESPACE, SINGLE_SPACE
          text.strip!
          has_content ||= (!keep_blank_rows && text.present?)
          text
        end

        if current_headers == :first_row
          # this row only supplies the header names
          current_headers = values.select(&:present?)
          next
        end

        next unless keep_blank_rows || has_content
        yield(headers ? zip(current_headers, values) : values)
      end
    ensure
      local_copy.cleanup
    end

    private

    # Pairs each key with the value at the same position, in an ordered hash.
    # http://snippets.dzone.com/posts/show/406
    def zip(keys, values)
      keys.zip(values).inject(::ActiveSupport::OrderedHash.new) do |pairs, (key, value)|
        pairs[key] = value
        pairs
      end
    end

    # Reads the whole local copy, HTML-unescapes it, rewinds the IO, and removes
    # every literal SOFT_HYPHEN entity that is left.
    #
    # should we be doing this in ruby?
    def unescaped_xml_without_soft_hyphens
      raw = local_copy.encoded_io.read
      unescaped = ::CGI.unescapeHTML raw
      local_copy.encoded_io.rewind
      # get rid of MS Office baddies
      unescaped.gsub! SOFT_HYPHEN, ''
      unescaped
    end
  end
end
@@ -0,0 +1,97 @@
1
class RemoteTable
  # Mixed in to process XLS, XLSX, and ODS with the Roo library.
  module ProcessedByRoo
    TAG = /<[^>]+>/
    BLANK = ''

    # Yield each row using Roo.
    #
    # Opens +local_copy.path+ with +roo_class+, selects +sheet+ when given, and walks
    # the rows from the first to the last selected row. Without headers each row is
    # yielded as an array covering every column; with headers it is yielded as an
    # ordered hash. Cell values are stringified, stripped of anything matching TAG,
    # and trimmed. Rows with no present cell are skipped unless keep_blank_rows is set.
    # The local copy is cleaned up afterwards, even on error.
    #
    # An exclusive +crop+ range stops one row before its end, as Ruby ranges do;
    # crops that are not Ranges but respond to #first and #last are treated as inclusive.
    def _each
      # sometimes Roo forgets to require iconv.
      require 'iconv'
      require 'roo'

      spreadsheet = roo_class.new local_copy.path, nil, :ignore
      spreadsheet.default_sheet = sheet if sheet

      if crop
        # NOTE(review): rows start at crop.first + 1 here, whereas Plaintext#crop_rows!
        # starts at crop.first - left unchanged; confirm which is intended.
        first_row = crop.first + 1
        # Range#last returns the end value even when the range excludes it
        excludes_end = crop.respond_to?(:exclude_end?) && crop.exclude_end?
        last_row = excludes_end ? crop.last - 1 : crop.last
      else
        first_row = skip + 1
        last_row = spreadsheet.last_row
      end

      if not headers

        # create an array to represent this row
        (first_row..last_row).each do |y|
          some_value_present = false
          output = (1..spreadsheet.last_column).map do |x|
            memo = spreadsheet.cell(y, x).to_s.dup
            memo = assume_utf8 memo
            memo.gsub! TAG, BLANK
            memo.strip!
            if not some_value_present and not keep_blank_rows and memo.present?
              some_value_present = true
            end
            memo
          end
          if keep_blank_rows or some_value_present
            yield output
          end
        end

      else

        # create a hash to represent this row
        # current_headers maps each header name to its 1-based column number
        current_headers = ::ActiveSupport::OrderedHash.new
        if headers == :first_row
          (1..spreadsheet.last_column).each do |x|
            v = spreadsheet.cell(first_row, x)
            if v.blank?
              # then look up one
              v = spreadsheet.cell(first_row - 1, x)
            end
            if v.present?
              v = assume_utf8 v
              # 'foobar' is found at column 6
              current_headers[v] = x
            end
          end
          # "advance the cursor"
          first_row += 1
        else
          headers.each_with_index do |k, i|
            current_headers[k] = i + 1
          end
        end
        (first_row..last_row).each do |y|
          some_value_present = false
          output = ::ActiveSupport::OrderedHash.new
          current_headers.each do |k, x|
            memo = spreadsheet.cell(y, x).to_s.dup
            memo = assume_utf8 memo
            memo.gsub! TAG, BLANK
            memo.strip!
            if not some_value_present and not keep_blank_rows and memo.present?
              some_value_present = true
            end
            output[k] = memo
          end
          if keep_blank_rows or some_value_present
            yield output
          end
        end

      end
    ensure
      local_copy.cleanup
    end
  end
end