remote_table 1.2.2 → 1.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. data/.gitattributes +1 -0
  2. data/README.rdoc +12 -0
  3. data/lib/remote_table.rb +5 -0
  4. data/lib/remote_table/executor.rb +3 -0
  5. data/lib/remote_table/format.rb +16 -9
  6. data/lib/remote_table/format/delimited.rb +10 -8
  7. data/lib/remote_table/format/fixed_width.rb +12 -5
  8. data/lib/remote_table/format/mixins/processed_by_nokogiri.rb +4 -2
  9. data/lib/remote_table/format/mixins/processed_by_roo.rb +9 -12
  10. data/lib/remote_table/format/mixins/textual.rb +13 -2
  11. data/lib/remote_table/local_file.rb +12 -0
  12. data/lib/remote_table/properties.rb +48 -36
  13. data/lib/remote_table/version.rb +1 -1
  14. data/remote_table.gemspec +2 -4
  15. data/test/helper.rb +16 -1
  16. data/test/support/list-en1-semic-3.neooffice.binary.ods +0 -0
  17. data/test/support/list-en1-semic-3.neooffice.iso-8859-1.csv +0 -0
  18. data/test/support/list-en1-semic-3.neooffice.iso-8859-1.fixed_width-64 +0 -0
  19. data/test/support/list-en1-semic-3.neooffice.utf-8.csv +0 -0
  20. data/test/support/list-en1-semic-3.neooffice.utf-8.fixed_width-62 +0 -0
  21. data/test/support/list-en1-semic-3.neooffice.utf-8.html +0 -0
  22. data/test/support/list-en1-semic-3.neooffice.utf-8.xml +0 -0
  23. data/test/support/list-en1-semic-3.office-2011-for-mac-sp1-excel-95.binary.xls +0 -0
  24. data/test/support/list-en1-semic-3.office-2011-for-mac-sp1.binary.xls +0 -0
  25. data/test/support/list-en1-semic-3.office-2011-for-mac-sp1.binary.xlsx +0 -0
  26. data/test/support/list-en1-semic-3.office-2011-for-mac-sp1.iso-8859-1.html +0 -0
  27. data/test/support/list-en1-semic-3.office-2011-for-mac-sp1.mac.csv-comma +0 -0
  28. data/test/support/list-en1-semic-3.office-2011-for-mac-sp1.utf-8.html +0 -0
  29. data/test/support/list-en1-semic-3.original.iso-8859-1.csv +0 -0
  30. data/test/test_old_syntax.rb +1 -1
  31. data/test/test_old_transform.rb +26 -31
  32. data/test/test_remote_table.rb +34 -7
  33. metadata +37 -81
data/.gitattributes ADDED
@@ -0,0 +1 @@
1
+ list-en1-semic-3* -crlf -diff -merge
data/README.rdoc CHANGED
@@ -6,6 +6,13 @@ Open local or remote XLSX, XLS, ODS, CSV and fixed-width files.
6
6
 
7
7
  Used by http://data.brighterplanet.com and the data_miner gem (http://github.com/seamusabshere/data_miner)
8
8
 
9
+ ==Requirements
10
+
11
+ * POSIX operating system (not windows)
12
+ * curl, iconv, perl, cat, cut, tail, etc. accessible from /usr/local/bin:/usr/bin:/bin
13
+
14
+ As this library matures, those should go away.
15
+
9
16
  ==Example
10
17
 
11
18
  ?> t = RemoteTable.new 'http://www.fueleconomy.gov/FEG/epadata/98guide6.zip', 'filename' => '98guide6.csv'
@@ -136,6 +143,11 @@ More examples:
136
143
  [ 'spacer', 1 ],
137
144
  [ 'header1', 10, { :type => :string } ]]
138
145
 
146
+ ==Helpful hints
147
+
148
+ * ASCII-8BIT is the same as BINARY
149
+ * ISO-8859-1 is the same as Latin1
150
+
139
151
  ==Custom parsers
140
152
 
141
153
  See the test file and also data_miner examples of custom parsers.
data/lib/remote_table.rb CHANGED
@@ -1,3 +1,8 @@
1
+ if ::RUBY_VERSION < '1.9' and $KCODE != 'UTF8'
2
+ $stderr.puts "[remote_table] Ruby 1.8 detected, setting $KCODE to UTF8 so that ActiveSupport::Multibyte works properly."
3
+ $KCODE = 'UTF8'
4
+ end
5
+
1
6
  require 'active_support'
2
7
  require 'active_support/version'
3
8
  %w{
@@ -16,6 +16,9 @@ class RemoteTable
16
16
 
17
17
  def backtick_with_reporting(cmd, raise_on_error = false)
18
18
  cmd = cmd.gsub /\n/m, ' '
19
+ if ::ENV['REMOTE_TABLE_DEBUG'] and ::ENV['REMOTE_TABLE_DEBUG'].include? 'backtick'
20
+ $stderr.puts "[remote_table] Executing #{cmd}"
21
+ end
19
22
  pid = ::POSIX::Spawn.spawn({ 'PATH' => '/usr/local/bin:/usr/bin:/bin' }, cmd)
20
23
  stat = ::Process::waitpid pid
21
24
  if raise_on_error and not stat.success?
@@ -24,17 +24,24 @@ class RemoteTable
24
24
  @t = t
25
25
  end
26
26
 
27
- def utf8(str)
27
+ def transliterate_to_utf8(str)
28
+ return if str.nil?
29
+ $stderr.puts "[remote_table translit] Before: #{str}" if ::ENV['REMOTE_TABLE_DEBUG'] and ::ENV['REMOTE_TABLE_DEBUG'].include?('translit')
30
+ transliterated_str = if ::RUBY_VERSION >= '1.9'
31
+ str.ensure_encoding t.properties.external_encoding, :external_encoding => t.properties.internal_encoding, :invalid_characters => :transcode
32
+ else
33
+ ::Iconv.conv(t.properties.external_encoding_iconv, t.properties.internal_encoding, str.to_s + ' ')[0..-2]
34
+ end
35
+ $stderr.puts "[remote_table translit] After: #{transliterated_str}" if ::ENV['REMOTE_TABLE_DEBUG'] and ::ENV['REMOTE_TABLE_DEBUG'].include?('translit')
36
+ transliterated_str
37
+ end
38
+
39
+ def assume_utf8(str)
40
+ return if str.nil?
28
41
  if ::RUBY_VERSION >= '1.9'
29
- str.ensure_encoding 'UTF-8', :external_encoding => t.properties.encoding, :invalid_characters => :transcode
42
+ str.encode! t.properties.external_encoding
30
43
  else
31
- return str if t.properties.encoding[0] =~ /utf.?8/i
32
- begin
33
- ::Iconv.conv('UTF-8//TRANSLIT', t.properties.encoding[0], str.to_s + ' ')[0..-2]
34
- rescue ::Iconv::IllegalSequence
35
- $stderr.puts "[remote_table] Unable to transliterate #{str} into UTF-8 given #{t.properties.encoding[0]}"
36
- str
37
- end
44
+ str
38
45
  end
39
46
  end
40
47
 
@@ -17,19 +17,21 @@ class RemoteTable
17
17
  include Textual
18
18
  def each(&blk)
19
19
  remove_useless_characters!
20
+ fix_newlines!
21
+ transliterate_whole_file_to_utf8!
20
22
  skip_rows!
21
- CSV.foreach(t.local_file.path, fastercsv_options) do |row|
23
+ CSV.new(t.local_file.encoded_io, fastercsv_options).each do |row|
22
24
  if row.is_a?(CSV::Row)
23
- output = row.inject(::ActiveSupport::OrderedHash.new) do |memo, (key, value)|
24
- if key.present?
25
- value = '' if value.nil?
26
- memo[key] = utf8 value
25
+ hash = row.inject(::ActiveSupport::OrderedHash.new) do |memo, (k, v)|
26
+ if k.present?
27
+ memo[k] = v.to_s
27
28
  end
28
29
  memo
29
30
  end
30
- yield output if t.properties.keep_blank_rows or output.any? { |k, v| v.present? }
31
- else
32
- yield row if t.properties.keep_blank_rows or row.any? { |v| v.present? }
31
+ yield hash if t.properties.keep_blank_rows or hash.any? { |k, v| v.present? }
32
+ elsif row.is_a?(::Array)
33
+ array = row.map { |v| v.to_s }
34
+ yield array if t.properties.keep_blank_rows or array.any? { |v| v.present? }
33
35
  end
34
36
  end
35
37
  ensure
@@ -1,17 +1,20 @@
1
- require 'slither'
1
+ require 'fixed_width'
2
+
2
3
  class RemoteTable
3
4
  class Format
4
5
  class FixedWidth < Format
5
6
  include Textual
6
7
  def each(&blk)
7
8
  remove_useless_characters!
9
+ fix_newlines!
10
+ transliterate_whole_file_to_utf8!
8
11
  crop_rows!
9
12
  skip_rows!
10
13
  cut_columns!
11
14
  parser.parse[:rows].each do |row|
12
15
  row.reject! { |k, v| k.blank? }
13
16
  row.each do |k, v|
14
- row[k] = utf8 v
17
+ row[k] = v.strip
15
18
  end
16
19
  yield row if t.properties.keep_blank_rows or row.any? { |k, v| v.present? }
17
20
  end
@@ -22,16 +25,20 @@ class RemoteTable
22
25
  private
23
26
 
24
27
  def parser
25
- @parser ||= ::Slither::Parser.new definition, t.local_file.path
28
+ return @parser if @parser.is_a?(::FixedWidth::Parser)
29
+ if ::FixedWidth::Section.private_instance_methods.map(&:to_sym).include?(:unpacker)
30
+ raise "[remote_table] You need a different (newer) version of the FixedWidth gem that supports multibyte encoding, sometime after https://github.com/timonk/fixed_width/pull/1 was incorporated"
31
+ end
32
+ @parser = ::FixedWidth::Parser.new definition, t.local_file.encoded_io
26
33
  end
27
34
 
28
35
  def definition
29
36
  @definition ||= if t.properties.schema_name.is_a?(::String) or t.properties.schema_name.is_a?(::Symbol)
30
- ::Slither.send :definition, t.properties.schema_name
37
+ ::FixedWidth.send :definition, t.properties.schema_name
31
38
  elsif t.properties.schema.is_a?(::Array)
32
39
  everything = lambda { |_| true }
33
40
  srand # in case this was forked by resque
34
- ::Slither.define(rand.to_s) do |d|
41
+ ::FixedWidth.define(rand.to_s) do |d|
35
42
  d.rows do |row|
36
43
  row.trap(&everything)
37
44
  t.properties.schema.each do |name, width, options|
@@ -4,7 +4,9 @@ class RemoteTable
4
4
  class Format
5
5
  module ProcessedByNokogiri
6
6
  def each
7
+ raise "[remote_table] Need :row_css or :row_xpath in order to process XML or HTML" unless t.properties.row_css or t.properties.row_xpath
7
8
  remove_useless_characters!
9
+ transliterate_whole_file_to_utf8!
8
10
  first_row = true
9
11
  keys = t.properties.headers if t.properties.headers.is_a?(::Array)
10
12
  xml = nokogiri_class.parse(unescaped_xml_without_soft_hyphens, nil, 'UTF-8')
@@ -15,7 +17,7 @@ class RemoteTable
15
17
  row.xpath(t.properties.column_xpath)
16
18
  else
17
19
  [row]
18
- end.map { |cell| cell.content.gsub(/\s+/, ' ').strip }
20
+ end.map { |cell| assume_utf8 cell.content.gsub(/\s+/, ' ').strip }
19
21
  if first_row and t.properties.use_first_row_as_header?
20
22
  keys = values
21
23
  first_row = false
@@ -57,7 +59,7 @@ class RemoteTable
57
59
 
58
60
  # should we be doing this in ruby?
59
61
  def unescaped_xml_without_soft_hyphens
60
- str = ::CGI.unescapeHTML utf8(::IO.read(t.local_file.path))
62
+ str = ::CGI.unescapeHTML t.local_file.encoded_io.read
61
63
  # get rid of MS Office baddies
62
64
  str.gsub! '&shy;', ''
63
65
  str
@@ -6,9 +6,9 @@ class RemoteTable
6
6
  spreadsheet = roo_class.new t.local_file.path, nil, :ignore
7
7
  spreadsheet.default_sheet = t.properties.sheet.is_a?(::Numeric) ? spreadsheet.sheets[t.properties.sheet] : t.properties.sheet
8
8
  if t.properties.output_class == ::Array
9
- (first_data_row..spreadsheet.last_row).each do |y|
9
+ (first_row..spreadsheet.last_row).each do |y|
10
10
  output = (1..spreadsheet.last_column).map do |x|
11
- spreadsheet.cell(y, x).to_s.gsub(/<[^>]+>/, '').strip
11
+ assume_utf8 spreadsheet.cell(y, x).to_s.gsub(/<[^>]+>/, '').strip
12
12
  end
13
13
  yield output if t.properties.keep_blank_rows or output.any? { |v| v.present? }
14
14
  end
@@ -16,18 +16,19 @@ class RemoteTable
16
16
  keys = {}
17
17
  if t.properties.use_first_row_as_header?
18
18
  (1..spreadsheet.last_column).each do |x|
19
- keys[x] = spreadsheet.cell(header_row, x)
20
- keys[x] = spreadsheet.cell(header_row - 1, x) if keys[x].blank? # look up
19
+ keys[x] = spreadsheet.cell(first_row, x)
20
+ keys[x] = spreadsheet.cell(first_row - 1, x) if keys[x].blank? # look up
21
+ keys[x] = assume_utf8 keys[x]
21
22
  end
22
23
  else
23
24
  (1..spreadsheet.last_column).each do |x|
24
- keys[x] = t.properties.headers[x - 1]
25
+ keys[x] = assume_utf8 t.properties.headers[x - 1]
25
26
  end
26
27
  end
27
- (first_data_row..spreadsheet.last_row).each do |y|
28
+ (first_row+1..spreadsheet.last_row).each do |y|
28
29
  output = (1..spreadsheet.last_column).inject(::ActiveSupport::OrderedHash.new) do |memo, x|
29
30
  if keys[x].present?
30
- memo[keys[x]] = spreadsheet.cell(y, x).to_s.gsub(/<[^>]+>/, '').strip
31
+ memo[keys[x]] = assume_utf8 spreadsheet.cell(y, x).to_s.gsub(/<[^>]+>/, '').strip
31
32
  end
32
33
  memo
33
34
  end
@@ -40,13 +41,9 @@ class RemoteTable
40
41
 
41
42
  private
42
43
 
43
- def header_row
44
+ def first_row
44
45
  1 + t.properties.skip
45
46
  end
46
-
47
- def first_data_row
48
- 1 + header_row
49
- end
50
47
  end
51
48
  end
52
49
  end
@@ -6,11 +6,22 @@ class RemoteTable
6
6
  USELESS_CHARACTERS = [
7
7
  '\xef\xbb\xbf', # UTF-8 byte order mark
8
8
  '\xc2\xad', # soft hyphen, often inserted by MS Office (html: &shy;)
9
- '\xad',
10
- # '\xa0'
11
9
  ]
12
10
  def remove_useless_characters!
13
11
  ::RemoteTable.executor.bang t.local_file.path, "perl -pe 's/#{USELESS_CHARACTERS.join '//g; s/'}//g'"
12
+ if t.properties.internal_encoding =~ /windows.?1252/i
13
+ # soft hyphen again, as I have seen it appear in windows 1252
14
+ ::RemoteTable.executor.bang t.local_file.path, %q{perl -pe 's/\xad//g'}
15
+ end
16
+ end
17
+
18
+ def transliterate_whole_file_to_utf8!
19
+ ::RemoteTable.executor.bang t.local_file.path, "iconv -c -f #{::Escape.shell_single_word t.properties.internal_encoding} -t #{::Escape.shell_single_word t.properties.external_encoding_iconv}"
20
+ t.properties.update 'encoding' => t.properties.external_encoding
21
+ end
22
+
23
+ def fix_newlines!
24
+ ::RemoteTable.executor.bang t.local_file.path, %q{perl -pe 's/\r\n|\n|\r/\n/g'}
14
25
  end
15
26
 
16
27
  def skip_rows!
@@ -15,8 +15,20 @@ class RemoteTable
15
15
  @path
16
16
  end
17
17
 
18
+ def encoded_io
19
+ @encoded_io ||= if ::RUBY_VERSION >= '1.9'
20
+ ::File.open path, 'rb', :internal_encoding => t.properties.internal_encoding, :external_encoding => t.properties.external_encoding
21
+ else
22
+ ::File.open path, 'rb'
23
+ end
24
+ end
25
+
18
26
  def delete
27
+ if @encoded_io.respond_to?(:closed?) and !@encoded_io.closed?
28
+ @encoded_io.close
29
+ end
19
30
  ::FileUtils.rm_rf staging_dir_path
31
+ @encoded_io = nil
20
32
  @path = nil
21
33
  @staging_dir_path = nil
22
34
  end
@@ -3,8 +3,15 @@ class RemoteTable
3
3
  # Represents the properties of a RemoteTable, whether they are explicitly set by the user or inferred automatically.
4
4
  class Properties
5
5
  attr_reader :t
6
+ attr_reader :current_options
7
+
6
8
  def initialize(t)
7
9
  @t = t
10
+ @current_options = t.options.dup
11
+ end
12
+
13
+ def update(options)
14
+ current_options.update options
8
15
  end
9
16
 
10
17
  # The parsed URI of the file to get.
@@ -22,19 +29,19 @@ class RemoteTable
22
29
  # * call each
23
30
  # Defaults to false.
24
31
  def streaming
25
- t.options['streaming'] || false
32
+ current_options['streaming'] || false
26
33
  end
27
34
 
28
35
  # Defaults to true.
29
36
  def warn_on_multiple_downloads
30
- t.options['warn_on_multiple_downloads'] != false
37
+ current_options['warn_on_multiple_downloads'] != false
31
38
  end
32
39
 
33
40
  # The headers specified by the user
34
41
  #
35
42
  # Default: :first_row
36
43
  def headers
37
- t.options['headers'].nil? ? :first_row : t.options['headers']
44
+ current_options['headers'].nil? ? :first_row : current_options['headers']
38
45
  end
39
46
 
40
47
  def use_first_row_as_header?
@@ -49,60 +56,65 @@ class RemoteTable
49
56
  #
50
57
  # Default: 0
51
58
  def sheet
52
- t.options['sheet'] || 0
59
+ current_options['sheet'] || 0
53
60
  end
54
61
 
55
62
  # Whether to keep blank rows
56
63
  #
57
64
  # Default: false
58
65
  def keep_blank_rows
59
- t.options['keep_blank_rows'] || false
66
+ current_options['keep_blank_rows'] || false
60
67
  end
61
68
 
62
69
  # Form data to send in with the download request
63
70
  def form_data
64
- t.options['form_data']
71
+ current_options['form_data']
65
72
  end
66
73
 
67
74
  # How many rows to skip
68
75
  #
69
76
  # Default: 0
70
77
  def skip
71
- t.options['skip'].to_i
78
+ current_options['skip'].to_i
72
79
  end
73
80
 
74
- # Likely external encoding
75
- #
76
- # Default: "UTF-8"
77
- def encoding
78
- @encoding ||= ::Array.wrap(t.options['encoding'] || [ 'ISO-8859-1', 'US-ASCII', 'WINDOWS-1252', 'ASCII-8BIT', 'UTF-8' ])
81
+ def internal_encoding
82
+ (current_options['encoding'] || 'UTF-8').upcase
83
+ end
84
+
85
+ def external_encoding
86
+ 'UTF-8'
87
+ end
88
+
89
+ def external_encoding_iconv
90
+ 'UTF-8//TRANSLIT'
79
91
  end
80
92
 
81
93
  # The delimiter
82
94
  #
83
95
  # Default: ","
84
96
  def delimiter
85
- t.options['delimiter'] || ','
97
+ current_options['delimiter'] || ','
86
98
  end
87
99
 
88
100
  # The XPath used to find rows
89
101
  def row_xpath
90
- t.options['row_xpath']
102
+ current_options['row_xpath']
91
103
  end
92
104
 
93
105
  # The XPath used to find columns
94
106
  def column_xpath
95
- t.options['column_xpath']
107
+ current_options['column_xpath']
96
108
  end
97
109
 
98
110
  # The CSS selector used to find rows
99
111
  def row_css
100
- t.options['row_css']
112
+ current_options['row_css']
101
113
  end
102
114
 
103
115
  # The CSS selector used to find columns
104
116
  def column_css
105
- t.options['column_css']
117
+ current_options['column_css']
106
118
  end
107
119
 
108
120
  # The compression type.
@@ -111,8 +123,8 @@ class RemoteTable
111
123
  #
112
124
  # Can be specified as: "gz", "zip", "bz2", "exe" (treated as "zip")
113
125
  def compression
114
- clue = if t.options['compression']
115
- t.options['compression'].to_s
126
+ clue = if current_options['compression']
127
+ current_options['compression'].to_s
116
128
  else
117
129
  ::File.extname uri.path
118
130
  end
@@ -134,8 +146,8 @@ class RemoteTable
134
146
  #
135
147
  # Can be specified as: "tar"
136
148
  def packing
137
- clue = if t.options['packing']
138
- t.options['packing'].to_s
149
+ clue = if current_options['packing']
150
+ current_options['packing'].to_s
139
151
  else
140
152
  ::File.extname(uri.path.sub(/\.#{compression}\z/, ''))
141
153
  end
@@ -150,7 +162,7 @@ class RemoteTable
150
162
  # Example:
151
163
  # RemoteTable.new 'http://www.fueleconomy.gov/FEG/epadata/08data.zip', 'glob' => '/*.csv'
152
164
  def glob
153
- t.options['glob']
165
+ current_options['glob']
154
166
  end
155
167
 
156
168
  # The filename, which can be used to pick a file out of an archive.
@@ -158,17 +170,17 @@ class RemoteTable
158
170
  # Example:
159
171
  # RemoteTable.new 'http://www.fueleconomy.gov/FEG/epadata/08data.zip', 'filename' => '2008_FE_guide_ALL_rel_dates_-no sales-for DOE-5-1-08.csv'
160
172
  def filename
161
- t.options['filename']
173
+ current_options['filename']
162
174
  end
163
175
 
164
176
  # Cut columns up to this character
165
177
  def cut
166
- t.options['cut']
178
+ current_options['cut']
167
179
  end
168
180
 
169
181
  # Crop rows after this line
170
182
  def crop
171
- t.options['crop']
183
+ current_options['crop']
172
184
  end
173
185
 
174
186
  # The fixed-width schema, given as an array
@@ -183,31 +195,31 @@ class RemoteTable
183
195
  # [ 'spacer', 12 ],
184
196
  # [ 'header6', 10, { :type => :string } ]])
185
197
  def schema
186
- t.options['schema']
198
+ current_options['schema']
187
199
  end
188
200
 
189
- # The name of the fixed-width schema according to Slither
201
+ # The name of the fixed-width schema according to FixedWidth
190
202
  def schema_name
191
- t.options['schema_name']
203
+ current_options['schema_name']
192
204
  end
193
205
 
194
206
  # A proc to call to decide whether to return a row.
195
207
  def select
196
- t.options['select']
208
+ current_options['select']
197
209
  end
198
210
 
199
211
  # A proc to call to decide whether to return a row.
200
212
  def reject
201
- t.options['reject']
213
+ current_options['reject']
202
214
  end
203
215
 
204
216
  # A hash of options to create a new Errata instance (see the Errata gem at http://github.com/seamusabshere/errata) to be used on every row.
205
217
  def errata
206
- return unless t.options.has_key? 'errata'
207
- @errata ||= if t.options['errata'].is_a? ::Hash
208
- ::Errata.new t.options['errata']
218
+ return unless current_options.has_key? 'errata'
219
+ @errata ||= if current_options['errata'].is_a? ::Hash
220
+ ::Errata.new current_options['errata']
209
221
  else
210
- t.options['errata']
222
+ current_options['errata']
211
223
  end
212
224
  end
213
225
 
@@ -220,8 +232,8 @@ class RemoteTable
220
232
  # Can be specified as: "xlsx", "xls", "csv", "ods", "fixed_width", "html"
221
233
  def format
222
234
  return Format::Delimited if uri.host == 'spreadsheets.google.com'
223
- clue = if t.options['format']
224
- t.options['format'].to_s
235
+ clue = if current_options['format']
236
+ current_options['format'].to_s
225
237
  else
226
238
  ::File.extname t.local_file.path
227
239
  end
@@ -1,3 +1,3 @@
1
1
  class RemoteTable
2
- VERSION = "1.2.2"
2
+ VERSION = "1.2.3"
3
3
  end
data/remote_table.gemspec CHANGED
@@ -21,7 +21,7 @@ Gem::Specification.new do |s|
21
21
 
22
22
  s.add_dependency 'activesupport', '>=2.3.4'
23
23
  s.add_dependency 'roo', '~>1.9'
24
- s.add_dependency 'slither', '>=0.99.4'
24
+ s.add_dependency 'fixed_width-multibyte' # TODO replace with fixed_width once timon gets off vacation
25
25
  s.add_dependency 'i18n' # activesupport?
26
26
  s.add_dependency 'builder' # roo?
27
27
  s.add_dependency 'zip' # roo
@@ -31,9 +31,7 @@ Gem::Specification.new do |s|
31
31
  s.add_dependency 'escape', '>=0.0.4'
32
32
  s.add_dependency 'posix-spawn'
33
33
  s.add_dependency 'ensure-encoding'
34
- unless RUBY_VERSION >= '1.9'
35
- s.add_dependency 'fastercsv', '>=1.5.0'
36
- end
34
+ s.add_dependency 'fastercsv', '>=1.5.0'
37
35
 
38
36
  s.add_development_dependency 'errata', '>=0.2.0'
39
37
  s.add_development_dependency 'test-unit'
data/test/helper.rb CHANGED
@@ -4,11 +4,26 @@ Bundler.setup
4
4
  require 'test/unit'
5
5
  require 'shoulda'
6
6
  require 'ruby-debug'
7
- require 'tempfile'
8
7
 
9
8
  $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
10
9
  $LOAD_PATH.unshift(File.dirname(__FILE__))
11
10
  require File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib', 'remote_table'))
12
11
 
13
12
  class Test::Unit::TestCase
13
+ def setup
14
+ if RUBY_VERSION >= '1.9'
15
+ @old_default_internal = Encoding.default_internal
16
+ @old_default_external = Encoding.default_external
17
+ # totally random choices here
18
+ Encoding.default_internal = 'EUC-JP'
19
+ Encoding.default_external = 'Shift_JIS'
20
+ end
21
+ end
22
+
23
+ def teardown
24
+ if RUBY_VERSION >= '1.9'
25
+ Encoding.default_internal = @old_default_internal
26
+ Encoding.default_external = @old_default_external
27
+ end
28
+ end
14
29
  end
@@ -16,7 +16,7 @@ $test2_rows.freeze
16
16
  class TestOldSyntax < Test::Unit::TestCase
17
17
  should "open an XLSX like an array (numbered columns)" do
18
18
  t = RemoteTable.new(:url => 'www.customerreferenceprogram.org/uploads/CRP_RFP_template.xlsx', :headers => false)
19
- assert_equal "Secure encryption of all data", t.rows[5][0]
19
+ assert_equal "Software-As-A-Service", t.rows[5][0]
20
20
  end
21
21
 
22
22
  should "open an XLSX with custom headers" do
@@ -1,36 +1,31 @@
1
1
  require 'helper'
2
2
 
3
- class FuelOilParser
3
+ class NaturalGasParser
4
4
  def initialize(options = {})
5
5
  # nothing
6
6
  end
7
- def add_hints!(bus)
8
- bus[:sheet] = 'Data 1'
9
- bus[:skip] = 2
10
- bus[:select] = lambda { |row| row['year'] > 1989 }
11
- end
12
7
  def apply(row)
13
8
  virtual_rows = []
14
- row.keys.grep(/(.+) Residual Fuel Oil/) do |location_column_name|
15
- first_part = $1
16
- next if (cost = row[location_column_name]).blank? or (date = row['Date']).blank?
17
- if first_part.start_with?('U.S.')
18
- locatable = "united_states (Country)"
19
- elsif first_part.include?('PADD')
20
- /\(PADD (.*)\)/.match(first_part)
21
- padd_part = $1
22
- next if padd_part == '1' # skip PADD 1 because we always prefer subdistricts
23
- locatable = "#{padd_part} (PetroleumAdministrationForDefenseDistrict)"
9
+ row.keys.grep(/\A(.*) Natural Gas/) do |location_column_name|
10
+ match_1 = $1
11
+ next if (price = row[location_column_name]).blank? or (date = row['Date']).blank?
12
+ if match_1 == 'U.S.'
13
+ locatable_id = 'US'
14
+ locatable_type = 'Country'
24
15
  else
25
- locatable = "#{first_part} (State)"
16
+ locatable_id = match_1 # name
17
+ locatable_type = 'State'
26
18
  end
27
19
  date = Time.parse(date)
28
- virtual_rows << {
29
- 'locatable' => locatable,
30
- 'cost' => cost,
31
- 'year' => date.year,
32
- 'month' => date.month
33
- }
20
+ new_row = ActiveSupport::OrderedHash.new
21
+ new_row['locatable_id'] = locatable_id
22
+ new_row['locatable_type'] = locatable_type
23
+ new_row['price'] = price
24
+ new_row['year'] = date.year
25
+ new_row['month'] = date.month
26
+ row_hash = RemoteTable::Transform.row_hash new_row
27
+ new_row['row_hash'] = row_hash
28
+ virtual_rows << new_row
34
29
  end
35
30
  virtual_rows
36
31
  end
@@ -38,12 +33,12 @@ end
38
33
 
39
34
  class TestOldTransform < Test::Unit::TestCase
40
35
  should "open an XLS with a parser" do
41
- ma_1990_01 = {"month"=>1, "cost"=>"54.0", "locatable"=>"Massachusetts (State)", "year"=>1990}
42
- ga_1990_01 = {"month"=>1, "cost"=>"50.7", "locatable"=>"Georgia (State)", "year"=>1990}
43
-
44
- t = RemoteTable.new(:url => 'http://tonto.eia.doe.gov/dnav/pet/xls/PET_PRI_RESID_A_EPPR_PTA_CPGAL_M.xls',
45
- :transform => { :class => FuelOilParser })
46
- assert t.rows.include?(ma_1990_01)
47
- assert t.rows.include?(ga_1990_01)
36
+ t = RemoteTable.new(:url => 'http://tonto.eia.doe.gov/dnav/ng/xls/ng_pri_sum_a_EPG0_FWA_DMcf_a.xls',
37
+ :sheet => 'Data 1',
38
+ :skip => 2,
39
+ :select => lambda { |row| row['year'].to_i > 1989 },
40
+ :transform => { :class => NaturalGasParser })
41
+ assert_equal 'Country', t[0]['locatable_type']
42
+ assert_equal 'US', t[0]['locatable_id']
48
43
  end
49
- end
44
+ end
@@ -63,12 +63,12 @@ class TestRemoteTable < Test::Unit::TestCase
63
63
 
64
64
  # fixes ArgumentError: invalid byte sequence in UTF-8
65
65
  should %{safely strip soft hyphens and read windows-1252 html} do
66
- t = RemoteTable.new :url => "http://www.faa.gov/air_traffic/publications/atpubs/CNT/5-2-A.htm", :row_xpath => '//table/tr[2]/td/table/tr', :column_xpath => 'td'
66
+ t = RemoteTable.new :url => "http://www.faa.gov/air_traffic/publications/atpubs/CNT/5-2-A.htm", :row_xpath => '//table/tr[2]/td/table/tr', :column_xpath => 'td', :encoding => 'windows-1252'
67
67
  assert t.rows.detect { |row| row['Model'] == 'A300B4600' }
68
68
  end
69
69
 
70
70
  should %{transliterate characters from ISO-8859-1} do
71
- t = RemoteTable.new :url => 'http://static.brighterplanet.com/science/data/consumables/pets/breed_genders.csv'
71
+ t = RemoteTable.new :url => 'http://static.brighterplanet.com/science/data/consumables/pets/breed_genders.csv', :encoding => 'ISO-8859-1'
72
72
  assert t.rows.detect { |row| row['name'] == 'Briquet Griffon Vendéen' }
73
73
  end
74
74
 
@@ -86,15 +86,42 @@ class TestRemoteTable < Test::Unit::TestCase
86
86
  assert(time1 != time2)
87
87
  end
88
88
 
89
- should %{not die when it reads Åland Islands} do
90
- t = RemoteTable.new 'http://www.iso.org/iso/list-en1-semic-3.txt', :skip => 2, :headers => false, :delimiter => ';'
91
- assert_nothing_raised do
92
- t[1][0]
89
+ {
90
+ # IMPOSSIBLE "../support/list-en1-semic-3.office-2011-for-mac-sp1-excel-95.binary.xls" => {:format=>"xls", :encoding=>"binary"},
91
+ "../support/list-en1-semic-3.office-2011-for-mac-sp1.binary.xlsx" => {:format=>"xlsx"},
92
+ "../support/list-en1-semic-3.office-2011-for-mac-sp1.binary.xls" => {:format=>"xls"},
93
+ "../support/list-en1-semic-3.neooffice.binary.ods" => {:format=>"ods"},
94
+ "../support/list-en1-semic-3.neooffice.iso-8859-1.fixed_width-64" => {:format=>"fixed_width", :encoding=>"iso-8859-1", :schema => [['name', 63, { :type => :string }], ['iso_3166', 2, { :type => :string }]]},
95
+ "../support/list-en1-semic-3.neooffice.utf-8.fixed_width-62" => {:format=>"fixed_width", :schema => [['name', 61, { :type => :string }], ['iso_3166', 2, { :type => :string }]]},
96
+ # TODO "../support/list-en1-semic-3.office-2011-for-mac-sp1.utf-8.html" => {:format=>"html" },
97
+ # TODO "../support/list-en1-semic-3.office-2011-for-mac-sp1.iso-8859-1.html" => {:format=>"html", :encoding=>"iso-8859-1"},
98
+ # TODO "../support/list-en1-semic-3.neooffice.utf-8.html" => {:format=>"html" },
99
+ "../support/list-en1-semic-3.neooffice.utf-8.xml" => {:format=>"xml", :row_css=>'Row', :column_css => 'Data', :select => lambda { |row| row[1].to_s =~ /[A-Z]{2}/ }},
100
+ "../support/list-en1-semic-3.neooffice.iso-8859-1.csv" => {:format=>"csv", :encoding=>"iso-8859-1", :delimiter => ';'},
101
+ "../support/list-en1-semic-3.original.iso-8859-1.csv" => {:format=>"csv", :encoding=>"iso-8859-1", :delimiter => ';'},
102
+ "../support/list-en1-semic-3.office-2011-for-mac-sp1.mac.csv-comma" => {:format=>"csv", :encoding=>"MACROMAN"}, # comma because no option in excel
103
+ "../support/list-en1-semic-3.neooffice.utf-8.csv" => {:format=>"csv", :delimiter => ';'}
104
+ }.each do |k, v|
105
+ should %{open #{k} with encoding #{v[:encoding] || 'default'}} do
106
+ options = v.merge(:headers => false, :skip => 2)
107
+ t = RemoteTable.new "file://#{File.expand_path(k, __FILE__)}", options
108
+ a = %{ÅLAND ISLANDS}
109
+ b = (t[1].is_a?(::Array) ? t[1][0] : t[1]['name'])
110
+ if RUBY_VERSION >= '1.9'
111
+ assert_equal 'UTF-8', a.encoding.to_s
112
+ assert_equal 'UTF-8', b.encoding.to_s
113
+ end
114
+ assert_equal a, b
93
115
  end
94
116
  end
95
117
 
118
+ should %{recode as UTF-8 even ISO-8859-1 (or any other encoding)} do
119
+ t = RemoteTable.new 'http://www.iso.org/iso/list-en1-semic-3.txt', :skip => 2, :headers => false, :delimiter => ';', :encoding => 'ISO-8859-1'
120
+ assert_equal %{ÅLAND ISLANDS}, t[1][0]
121
+ end
122
+
96
123
  should %{parse a big CSV that is not UTF-8} do
97
- t = RemoteTable.new 'https://openflights.svn.sourceforge.net/svnroot/openflights/openflights/data/airports.dat', :headers => false
124
+ t = RemoteTable.new 'https://openflights.svn.sourceforge.net/svnroot/openflights/openflights/data/airports.dat', :headers => false#, :encoding => 'UTF-8'
98
125
  assert_equal 'Goroka', t[0][1]
99
126
  end
100
127
  end
metadata CHANGED
@@ -1,13 +1,8 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: remote_table
3
3
  version: !ruby/object:Gem::Version
4
- hash: 27
5
4
  prerelease:
6
- segments:
7
- - 1
8
- - 2
9
- - 2
10
- version: 1.2.2
5
+ version: 1.2.3
11
6
  platform: ruby
12
7
  authors:
13
8
  - Seamus Abshere
@@ -16,7 +11,8 @@ autorequire:
16
11
  bindir: bin
17
12
  cert_chain: []
18
13
 
19
- date: 2011-05-05 00:00:00 Z
14
+ date: 2011-05-21 00:00:00 -05:00
15
+ default_executable:
20
16
  dependencies:
21
17
  - !ruby/object:Gem::Dependency
22
18
  name: activesupport
@@ -26,11 +22,6 @@ dependencies:
26
22
  requirements:
27
23
  - - ">="
28
24
  - !ruby/object:Gem::Version
29
- hash: 11
30
- segments:
31
- - 2
32
- - 3
33
- - 4
34
25
  version: 2.3.4
35
26
  type: :runtime
36
27
  version_requirements: *id001
@@ -42,27 +33,18 @@ dependencies:
42
33
  requirements:
43
34
  - - ~>
44
35
  - !ruby/object:Gem::Version
45
- hash: 29
46
- segments:
47
- - 1
48
- - 9
49
36
  version: "1.9"
50
37
  type: :runtime
51
38
  version_requirements: *id002
52
39
  - !ruby/object:Gem::Dependency
53
- name: slither
40
+ name: fixed_width-multibyte
54
41
  prerelease: false
55
42
  requirement: &id003 !ruby/object:Gem::Requirement
56
43
  none: false
57
44
  requirements:
58
45
  - - ">="
59
46
  - !ruby/object:Gem::Version
60
- hash: 411
61
- segments:
62
- - 0
63
- - 99
64
- - 4
65
- version: 0.99.4
47
+ version: "0"
66
48
  type: :runtime
67
49
  version_requirements: *id003
68
50
  - !ruby/object:Gem::Dependency
@@ -73,9 +55,6 @@ dependencies:
73
55
  requirements:
74
56
  - - ">="
75
57
  - !ruby/object:Gem::Version
76
- hash: 3
77
- segments:
78
- - 0
79
58
  version: "0"
80
59
  type: :runtime
81
60
  version_requirements: *id004
@@ -87,9 +66,6 @@ dependencies:
87
66
  requirements:
88
67
  - - ">="
89
68
  - !ruby/object:Gem::Version
90
- hash: 3
91
- segments:
92
- - 0
93
69
  version: "0"
94
70
  type: :runtime
95
71
  version_requirements: *id005
@@ -101,9 +77,6 @@ dependencies:
101
77
  requirements:
102
78
  - - ">="
103
79
  - !ruby/object:Gem::Version
104
- hash: 3
105
- segments:
106
- - 0
107
80
  version: "0"
108
81
  type: :runtime
109
82
  version_requirements: *id006
@@ -115,11 +88,6 @@ dependencies:
115
88
  requirements:
116
89
  - - ">="
117
90
  - !ruby/object:Gem::Version
118
- hash: 5
119
- segments:
120
- - 1
121
- - 4
122
- - 1
123
91
  version: 1.4.1
124
92
  type: :runtime
125
93
  version_requirements: *id007
@@ -131,9 +99,6 @@ dependencies:
131
99
  requirements:
132
100
  - - ">="
133
101
  - !ruby/object:Gem::Version
134
- hash: 3
135
- segments:
136
- - 0
137
102
  version: "0"
138
103
  type: :runtime
139
104
  version_requirements: *id008
@@ -145,9 +110,6 @@ dependencies:
145
110
  requirements:
146
111
  - - ">="
147
112
  - !ruby/object:Gem::Version
148
- hash: 3
149
- segments:
150
- - 0
151
113
  version: "0"
152
114
  type: :runtime
153
115
  version_requirements: *id009
@@ -159,11 +121,6 @@ dependencies:
159
121
  requirements:
160
122
  - - ">="
161
123
  - !ruby/object:Gem::Version
162
- hash: 23
163
- segments:
164
- - 0
165
- - 0
166
- - 4
167
124
  version: 0.0.4
168
125
  type: :runtime
169
126
  version_requirements: *id010
@@ -175,9 +132,6 @@ dependencies:
175
132
  requirements:
176
133
  - - ">="
177
134
  - !ruby/object:Gem::Version
178
- hash: 3
179
- segments:
180
- - 0
181
135
  version: "0"
182
136
  type: :runtime
183
137
  version_requirements: *id011
@@ -189,9 +143,6 @@ dependencies:
189
143
  requirements:
190
144
  - - ">="
191
145
  - !ruby/object:Gem::Version
192
- hash: 3
193
- segments:
194
- - 0
195
146
  version: "0"
196
147
  type: :runtime
197
148
  version_requirements: *id012
@@ -203,11 +154,6 @@ dependencies:
203
154
  requirements:
204
155
  - - ">="
205
156
  - !ruby/object:Gem::Version
206
- hash: 3
207
- segments:
208
- - 1
209
- - 5
210
- - 0
211
157
  version: 1.5.0
212
158
  type: :runtime
213
159
  version_requirements: *id013
@@ -219,11 +165,6 @@ dependencies:
219
165
  requirements:
220
166
  - - ">="
221
167
  - !ruby/object:Gem::Version
222
- hash: 23
223
- segments:
224
- - 0
225
- - 2
226
- - 0
227
168
  version: 0.2.0
228
169
  type: :development
229
170
  version_requirements: *id014
@@ -235,9 +176,6 @@ dependencies:
235
176
  requirements:
236
177
  - - ">="
237
178
  - !ruby/object:Gem::Version
238
- hash: 3
239
- segments:
240
- - 0
241
179
  version: "0"
242
180
  type: :development
243
181
  version_requirements: *id015
@@ -249,23 +187,17 @@ dependencies:
249
187
  requirements:
250
188
  - - ">="
251
189
  - !ruby/object:Gem::Version
252
- hash: 3
253
- segments:
254
- - 0
255
190
  version: "0"
256
191
  type: :development
257
192
  version_requirements: *id016
258
193
  - !ruby/object:Gem::Dependency
259
- name: ruby-debug
194
+ name: ruby-debug19
260
195
  prerelease: false
261
196
  requirement: &id017 !ruby/object:Gem::Requirement
262
197
  none: false
263
198
  requirements:
264
199
  - - ">="
265
200
  - !ruby/object:Gem::Version
266
- hash: 3
267
- segments:
268
- - 0
269
201
  version: "0"
270
202
  type: :development
271
203
  version_requirements: *id017
@@ -280,6 +212,7 @@ extra_rdoc_files: []
280
212
 
281
213
  files:
282
214
  - .document
215
+ - .gitattributes
283
216
  - .gitignore
284
217
  - Gemfile
285
218
  - LICENSE
@@ -305,11 +238,26 @@ files:
305
238
  - lib/remote_table/version.rb
306
239
  - remote_table.gemspec
307
240
  - test/helper.rb
241
+ - test/support/list-en1-semic-3.neooffice.binary.ods
242
+ - test/support/list-en1-semic-3.neooffice.iso-8859-1.csv
243
+ - test/support/list-en1-semic-3.neooffice.iso-8859-1.fixed_width-64
244
+ - test/support/list-en1-semic-3.neooffice.utf-8.csv
245
+ - test/support/list-en1-semic-3.neooffice.utf-8.fixed_width-62
246
+ - test/support/list-en1-semic-3.neooffice.utf-8.html
247
+ - test/support/list-en1-semic-3.neooffice.utf-8.xml
248
+ - test/support/list-en1-semic-3.office-2011-for-mac-sp1-excel-95.binary.xls
249
+ - test/support/list-en1-semic-3.office-2011-for-mac-sp1.binary.xls
250
+ - test/support/list-en1-semic-3.office-2011-for-mac-sp1.binary.xlsx
251
+ - test/support/list-en1-semic-3.office-2011-for-mac-sp1.iso-8859-1.html
252
+ - test/support/list-en1-semic-3.office-2011-for-mac-sp1.mac.csv-comma
253
+ - test/support/list-en1-semic-3.office-2011-for-mac-sp1.utf-8.html
254
+ - test/support/list-en1-semic-3.original.iso-8859-1.csv
308
255
  - test/test_big.rb
309
256
  - test/test_errata.rb
310
257
  - test/test_old_syntax.rb
311
258
  - test/test_old_transform.rb
312
259
  - test/test_remote_table.rb
260
+ has_rdoc: true
313
261
  homepage: https://github.com/seamusabshere/remote_table
314
262
  licenses: []
315
263
 
@@ -323,28 +271,36 @@ required_ruby_version: !ruby/object:Gem::Requirement
323
271
  requirements:
324
272
  - - ">="
325
273
  - !ruby/object:Gem::Version
326
- hash: 3
327
- segments:
328
- - 0
329
274
  version: "0"
330
275
  required_rubygems_version: !ruby/object:Gem::Requirement
331
276
  none: false
332
277
  requirements:
333
278
  - - ">="
334
279
  - !ruby/object:Gem::Version
335
- hash: 3
336
- segments:
337
- - 0
338
280
  version: "0"
339
281
  requirements: []
340
282
 
341
283
  rubyforge_project: remotetable
342
- rubygems_version: 1.7.2
284
+ rubygems_version: 1.6.2
343
285
  signing_key:
344
286
  specification_version: 3
345
287
  summary: Open local or remote XLSX, XLS, ODS, CSV and fixed-width files.
346
288
  test_files:
347
289
  - test/helper.rb
290
+ - test/support/list-en1-semic-3.neooffice.binary.ods
291
+ - test/support/list-en1-semic-3.neooffice.iso-8859-1.csv
292
+ - test/support/list-en1-semic-3.neooffice.iso-8859-1.fixed_width-64
293
+ - test/support/list-en1-semic-3.neooffice.utf-8.csv
294
+ - test/support/list-en1-semic-3.neooffice.utf-8.fixed_width-62
295
+ - test/support/list-en1-semic-3.neooffice.utf-8.html
296
+ - test/support/list-en1-semic-3.neooffice.utf-8.xml
297
+ - test/support/list-en1-semic-3.office-2011-for-mac-sp1-excel-95.binary.xls
298
+ - test/support/list-en1-semic-3.office-2011-for-mac-sp1.binary.xls
299
+ - test/support/list-en1-semic-3.office-2011-for-mac-sp1.binary.xlsx
300
+ - test/support/list-en1-semic-3.office-2011-for-mac-sp1.iso-8859-1.html
301
+ - test/support/list-en1-semic-3.office-2011-for-mac-sp1.mac.csv-comma
302
+ - test/support/list-en1-semic-3.office-2011-for-mac-sp1.utf-8.html
303
+ - test/support/list-en1-semic-3.original.iso-8859-1.csv
348
304
  - test/test_big.rb
349
305
  - test/test_errata.rb
350
306
  - test/test_old_syntax.rb