remote_table 1.2.2 → 1.2.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (33) hide show
  1. data/.gitattributes +1 -0
  2. data/README.rdoc +12 -0
  3. data/lib/remote_table.rb +5 -0
  4. data/lib/remote_table/executor.rb +3 -0
  5. data/lib/remote_table/format.rb +16 -9
  6. data/lib/remote_table/format/delimited.rb +10 -8
  7. data/lib/remote_table/format/fixed_width.rb +12 -5
  8. data/lib/remote_table/format/mixins/processed_by_nokogiri.rb +4 -2
  9. data/lib/remote_table/format/mixins/processed_by_roo.rb +9 -12
  10. data/lib/remote_table/format/mixins/textual.rb +13 -2
  11. data/lib/remote_table/local_file.rb +12 -0
  12. data/lib/remote_table/properties.rb +48 -36
  13. data/lib/remote_table/version.rb +1 -1
  14. data/remote_table.gemspec +2 -4
  15. data/test/helper.rb +16 -1
  16. data/test/support/list-en1-semic-3.neooffice.binary.ods +0 -0
  17. data/test/support/list-en1-semic-3.neooffice.iso-8859-1.csv +0 -0
  18. data/test/support/list-en1-semic-3.neooffice.iso-8859-1.fixed_width-64 +0 -0
  19. data/test/support/list-en1-semic-3.neooffice.utf-8.csv +0 -0
  20. data/test/support/list-en1-semic-3.neooffice.utf-8.fixed_width-62 +0 -0
  21. data/test/support/list-en1-semic-3.neooffice.utf-8.html +0 -0
  22. data/test/support/list-en1-semic-3.neooffice.utf-8.xml +0 -0
  23. data/test/support/list-en1-semic-3.office-2011-for-mac-sp1-excel-95.binary.xls +0 -0
  24. data/test/support/list-en1-semic-3.office-2011-for-mac-sp1.binary.xls +0 -0
  25. data/test/support/list-en1-semic-3.office-2011-for-mac-sp1.binary.xlsx +0 -0
  26. data/test/support/list-en1-semic-3.office-2011-for-mac-sp1.iso-8859-1.html +0 -0
  27. data/test/support/list-en1-semic-3.office-2011-for-mac-sp1.mac.csv-comma +0 -0
  28. data/test/support/list-en1-semic-3.office-2011-for-mac-sp1.utf-8.html +0 -0
  29. data/test/support/list-en1-semic-3.original.iso-8859-1.csv +0 -0
  30. data/test/test_old_syntax.rb +1 -1
  31. data/test/test_old_transform.rb +26 -31
  32. data/test/test_remote_table.rb +34 -7
  33. metadata +37 -81
data/.gitattributes ADDED
@@ -0,0 +1 @@
1
+ list-en1-semic-3* -crlf -diff -merge
data/README.rdoc CHANGED
@@ -6,6 +6,13 @@ Open local or remote XLSX, XLS, ODS, CSV and fixed-width files.
6
6
 
7
7
  Used by http://data.brighterplanet.com and the data_miner gem (http://github.com/seamusabshere/data_miner)
8
8
 
9
+ ==Requirements
10
+
11
+ * POSIX operating system (not Windows)
12
+ * curl, iconv, perl, cat, cut, tail, etc. accessible from /usr/local/bin:/usr/bin:/bin
13
+
14
+ As this library matures, these requirements should go away.
15
+
9
16
  ==Example
10
17
 
11
18
  ?> t = RemoteTable.new 'http://www.fueleconomy.gov/FEG/epadata/98guide6.zip', 'filename' => '98guide6.csv'
@@ -136,6 +143,11 @@ More examples:
136
143
  [ 'spacer', 1 ],
137
144
  [ 'header1', 10, { :type => :string } ]]
138
145
 
146
+ ==Helpful hints
147
+
148
+ * ASCII-8BIT is the same as BINARY
149
+ * ISO-8859-1 is the same as Latin1
150
+
139
151
  ==Custom parsers
140
152
 
141
153
  See the test file and also data_miner examples of custom parsers.
data/lib/remote_table.rb CHANGED
@@ -1,3 +1,8 @@
1
+ if ::RUBY_VERSION < '1.9' and $KCODE != 'UTF8'
2
+ $stderr.puts "[remote_table] Ruby 1.8 detected, setting $KCODE to UTF8 so that ActiveSupport::Multibyte works properly."
3
+ $KCODE = 'UTF8'
4
+ end
5
+
1
6
  require 'active_support'
2
7
  require 'active_support/version'
3
8
  %w{
@@ -16,6 +16,9 @@ class RemoteTable
16
16
 
17
17
  def backtick_with_reporting(cmd, raise_on_error = false)
18
18
  cmd = cmd.gsub /\n/m, ' '
19
+ if ::ENV['REMOTE_TABLE_DEBUG'] and ::ENV['REMOTE_TABLE_DEBUG'].include? 'backtick'
20
+ $stderr.puts "[remote_table] Executing #{cmd}"
21
+ end
19
22
  pid = ::POSIX::Spawn.spawn({ 'PATH' => '/usr/local/bin:/usr/bin:/bin' }, cmd)
20
23
  stat = ::Process::waitpid pid
21
24
  if raise_on_error and not stat.success?
@@ -24,17 +24,24 @@ class RemoteTable
24
24
  @t = t
25
25
  end
26
26
 
27
- def utf8(str)
27
+ def transliterate_to_utf8(str)
28
+ return if str.nil?
29
+ $stderr.puts "[remote_table translit] Before: #{str}" if ::ENV['REMOTE_TABLE_DEBUG'] and ::ENV['REMOTE_TABLE_DEBUG'].include?('translit')
30
+ transliterated_str = if ::RUBY_VERSION >= '1.9'
31
+ str.ensure_encoding t.properties.external_encoding, :external_encoding => t.properties.internal_encoding, :invalid_characters => :transcode
32
+ else
33
+ ::Iconv.conv(t.properties.external_encoding_iconv, t.properties.internal_encoding, str.to_s + ' ')[0..-2]
34
+ end
35
+ $stderr.puts "[remote_table translit] After: #{transliterated_str}" if ::ENV['REMOTE_TABLE_DEBUG'] and ::ENV['REMOTE_TABLE_DEBUG'].include?('translit')
36
+ transliterated_str
37
+ end
38
+
39
+ def assume_utf8(str)
40
+ return if str.nil?
28
41
  if ::RUBY_VERSION >= '1.9'
29
- str.ensure_encoding 'UTF-8', :external_encoding => t.properties.encoding, :invalid_characters => :transcode
42
+ str.encode! t.properties.external_encoding
30
43
  else
31
- return str if t.properties.encoding[0] =~ /utf.?8/i
32
- begin
33
- ::Iconv.conv('UTF-8//TRANSLIT', t.properties.encoding[0], str.to_s + ' ')[0..-2]
34
- rescue ::Iconv::IllegalSequence
35
- $stderr.puts "[remote_table] Unable to transliterate #{str} into UTF-8 given #{t.properties.encoding[0]}"
36
- str
37
- end
44
+ str
38
45
  end
39
46
  end
40
47
 
@@ -17,19 +17,21 @@ class RemoteTable
17
17
  include Textual
18
18
  def each(&blk)
19
19
  remove_useless_characters!
20
+ fix_newlines!
21
+ transliterate_whole_file_to_utf8!
20
22
  skip_rows!
21
- CSV.foreach(t.local_file.path, fastercsv_options) do |row|
23
+ CSV.new(t.local_file.encoded_io, fastercsv_options).each do |row|
22
24
  if row.is_a?(CSV::Row)
23
- output = row.inject(::ActiveSupport::OrderedHash.new) do |memo, (key, value)|
24
- if key.present?
25
- value = '' if value.nil?
26
- memo[key] = utf8 value
25
+ hash = row.inject(::ActiveSupport::OrderedHash.new) do |memo, (k, v)|
26
+ if k.present?
27
+ memo[k] = v.to_s
27
28
  end
28
29
  memo
29
30
  end
30
- yield output if t.properties.keep_blank_rows or output.any? { |k, v| v.present? }
31
- else
32
- yield row if t.properties.keep_blank_rows or row.any? { |v| v.present? }
31
+ yield hash if t.properties.keep_blank_rows or hash.any? { |k, v| v.present? }
32
+ elsif row.is_a?(::Array)
33
+ array = row.map { |v| v.to_s }
34
+ yield array if t.properties.keep_blank_rows or array.any? { |v| v.present? }
33
35
  end
34
36
  end
35
37
  ensure
@@ -1,17 +1,20 @@
1
- require 'slither'
1
+ require 'fixed_width'
2
+
2
3
  class RemoteTable
3
4
  class Format
4
5
  class FixedWidth < Format
5
6
  include Textual
6
7
  def each(&blk)
7
8
  remove_useless_characters!
9
+ fix_newlines!
10
+ transliterate_whole_file_to_utf8!
8
11
  crop_rows!
9
12
  skip_rows!
10
13
  cut_columns!
11
14
  parser.parse[:rows].each do |row|
12
15
  row.reject! { |k, v| k.blank? }
13
16
  row.each do |k, v|
14
- row[k] = utf8 v
17
+ row[k] = v.strip
15
18
  end
16
19
  yield row if t.properties.keep_blank_rows or row.any? { |k, v| v.present? }
17
20
  end
@@ -22,16 +25,20 @@ class RemoteTable
22
25
  private
23
26
 
24
27
  def parser
25
- @parser ||= ::Slither::Parser.new definition, t.local_file.path
28
+ return @parser if @parser.is_a?(::FixedWidth::Parser)
29
+ if ::FixedWidth::Section.private_instance_methods.map(&:to_sym).include?(:unpacker)
30
+ raise "[remote_table] You need a different (newer) version of the FixedWidth gem that supports multibyte encoding, sometime after https://github.com/timonk/fixed_width/pull/1 was incorporated"
31
+ end
32
+ @parser = ::FixedWidth::Parser.new definition, t.local_file.encoded_io
26
33
  end
27
34
 
28
35
  def definition
29
36
  @definition ||= if t.properties.schema_name.is_a?(::String) or t.properties.schema_name.is_a?(::Symbol)
30
- ::Slither.send :definition, t.properties.schema_name
37
+ ::FixedWidth.send :definition, t.properties.schema_name
31
38
  elsif t.properties.schema.is_a?(::Array)
32
39
  everything = lambda { |_| true }
33
40
  srand # in case this was forked by resque
34
- ::Slither.define(rand.to_s) do |d|
41
+ ::FixedWidth.define(rand.to_s) do |d|
35
42
  d.rows do |row|
36
43
  row.trap(&everything)
37
44
  t.properties.schema.each do |name, width, options|
@@ -4,7 +4,9 @@ class RemoteTable
4
4
  class Format
5
5
  module ProcessedByNokogiri
6
6
  def each
7
+ raise "[remote_table] Need :row_css or :row_xpath in order to process XML or HTML" unless t.properties.row_css or t.properties.row_xpath
7
8
  remove_useless_characters!
9
+ transliterate_whole_file_to_utf8!
8
10
  first_row = true
9
11
  keys = t.properties.headers if t.properties.headers.is_a?(::Array)
10
12
  xml = nokogiri_class.parse(unescaped_xml_without_soft_hyphens, nil, 'UTF-8')
@@ -15,7 +17,7 @@ class RemoteTable
15
17
  row.xpath(t.properties.column_xpath)
16
18
  else
17
19
  [row]
18
- end.map { |cell| cell.content.gsub(/\s+/, ' ').strip }
20
+ end.map { |cell| assume_utf8 cell.content.gsub(/\s+/, ' ').strip }
19
21
  if first_row and t.properties.use_first_row_as_header?
20
22
  keys = values
21
23
  first_row = false
@@ -57,7 +59,7 @@ class RemoteTable
57
59
 
58
60
  # should we be doing this in ruby?
59
61
  def unescaped_xml_without_soft_hyphens
60
- str = ::CGI.unescapeHTML utf8(::IO.read(t.local_file.path))
62
+ str = ::CGI.unescapeHTML t.local_file.encoded_io.read
61
63
  # get rid of MS Office baddies
62
64
  str.gsub! '&shy;', ''
63
65
  str
@@ -6,9 +6,9 @@ class RemoteTable
6
6
  spreadsheet = roo_class.new t.local_file.path, nil, :ignore
7
7
  spreadsheet.default_sheet = t.properties.sheet.is_a?(::Numeric) ? spreadsheet.sheets[t.properties.sheet] : t.properties.sheet
8
8
  if t.properties.output_class == ::Array
9
- (first_data_row..spreadsheet.last_row).each do |y|
9
+ (first_row..spreadsheet.last_row).each do |y|
10
10
  output = (1..spreadsheet.last_column).map do |x|
11
- spreadsheet.cell(y, x).to_s.gsub(/<[^>]+>/, '').strip
11
+ assume_utf8 spreadsheet.cell(y, x).to_s.gsub(/<[^>]+>/, '').strip
12
12
  end
13
13
  yield output if t.properties.keep_blank_rows or output.any? { |v| v.present? }
14
14
  end
@@ -16,18 +16,19 @@ class RemoteTable
16
16
  keys = {}
17
17
  if t.properties.use_first_row_as_header?
18
18
  (1..spreadsheet.last_column).each do |x|
19
- keys[x] = spreadsheet.cell(header_row, x)
20
- keys[x] = spreadsheet.cell(header_row - 1, x) if keys[x].blank? # look up
19
+ keys[x] = spreadsheet.cell(first_row, x)
20
+ keys[x] = spreadsheet.cell(first_row - 1, x) if keys[x].blank? # look up
21
+ keys[x] = assume_utf8 keys[x]
21
22
  end
22
23
  else
23
24
  (1..spreadsheet.last_column).each do |x|
24
- keys[x] = t.properties.headers[x - 1]
25
+ keys[x] = assume_utf8 t.properties.headers[x - 1]
25
26
  end
26
27
  end
27
- (first_data_row..spreadsheet.last_row).each do |y|
28
+ (first_row+1..spreadsheet.last_row).each do |y|
28
29
  output = (1..spreadsheet.last_column).inject(::ActiveSupport::OrderedHash.new) do |memo, x|
29
30
  if keys[x].present?
30
- memo[keys[x]] = spreadsheet.cell(y, x).to_s.gsub(/<[^>]+>/, '').strip
31
+ memo[keys[x]] = assume_utf8 spreadsheet.cell(y, x).to_s.gsub(/<[^>]+>/, '').strip
31
32
  end
32
33
  memo
33
34
  end
@@ -40,13 +41,9 @@ class RemoteTable
40
41
 
41
42
  private
42
43
 
43
- def header_row
44
+ def first_row
44
45
  1 + t.properties.skip
45
46
  end
46
-
47
- def first_data_row
48
- 1 + header_row
49
- end
50
47
  end
51
48
  end
52
49
  end
@@ -6,11 +6,22 @@ class RemoteTable
6
6
  USELESS_CHARACTERS = [
7
7
  '\xef\xbb\xbf', # UTF-8 byte order mark
8
8
  '\xc2\xad', # soft hyphen, often inserted by MS Office (html: &shy;)
9
- '\xad',
10
- # '\xa0'
11
9
  ]
12
10
  def remove_useless_characters!
13
11
  ::RemoteTable.executor.bang t.local_file.path, "perl -pe 's/#{USELESS_CHARACTERS.join '//g; s/'}//g'"
12
+ if t.properties.internal_encoding =~ /windows.?1252/i
13
+ # soft hyphen again, as I have seen it appear in windows 1252
14
+ ::RemoteTable.executor.bang t.local_file.path, %q{perl -pe 's/\xad//g'}
15
+ end
16
+ end
17
+
18
+ def transliterate_whole_file_to_utf8!
19
+ ::RemoteTable.executor.bang t.local_file.path, "iconv -c -f #{::Escape.shell_single_word t.properties.internal_encoding} -t #{::Escape.shell_single_word t.properties.external_encoding_iconv}"
20
+ t.properties.update 'encoding' => t.properties.external_encoding
21
+ end
22
+
23
+ def fix_newlines!
24
+ ::RemoteTable.executor.bang t.local_file.path, %q{perl -pe 's/\r\n|\n|\r/\n/g'}
14
25
  end
15
26
 
16
27
  def skip_rows!
@@ -15,8 +15,20 @@ class RemoteTable
15
15
  @path
16
16
  end
17
17
 
18
+ def encoded_io
19
+ @encoded_io ||= if ::RUBY_VERSION >= '1.9'
20
+ ::File.open path, 'rb', :internal_encoding => t.properties.internal_encoding, :external_encoding => t.properties.external_encoding
21
+ else
22
+ ::File.open path, 'rb'
23
+ end
24
+ end
25
+
18
26
  def delete
27
+ if @encoded_io.respond_to?(:closed?) and !@encoded_io.closed?
28
+ @encoded_io.close
29
+ end
19
30
  ::FileUtils.rm_rf staging_dir_path
31
+ @encoded_io = nil
20
32
  @path = nil
21
33
  @staging_dir_path = nil
22
34
  end
@@ -3,8 +3,15 @@ class RemoteTable
3
3
  # Represents the properties of a RemoteTable, whether they are explicitly set by the user or inferred automatically.
4
4
  class Properties
5
5
  attr_reader :t
6
+ attr_reader :current_options
7
+
6
8
  def initialize(t)
7
9
  @t = t
10
+ @current_options = t.options.dup
11
+ end
12
+
13
+ def update(options)
14
+ current_options.update options
8
15
  end
9
16
 
10
17
  # The parsed URI of the file to get.
@@ -22,19 +29,19 @@ class RemoteTable
22
29
  # * call each
23
30
  # Defaults to false.
24
31
  def streaming
25
- t.options['streaming'] || false
32
+ current_options['streaming'] || false
26
33
  end
27
34
 
28
35
  # Defaults to true.
29
36
  def warn_on_multiple_downloads
30
- t.options['warn_on_multiple_downloads'] != false
37
+ current_options['warn_on_multiple_downloads'] != false
31
38
  end
32
39
 
33
40
  # The headers specified by the user
34
41
  #
35
42
  # Default: :first_row
36
43
  def headers
37
- t.options['headers'].nil? ? :first_row : t.options['headers']
44
+ current_options['headers'].nil? ? :first_row : current_options['headers']
38
45
  end
39
46
 
40
47
  def use_first_row_as_header?
@@ -49,60 +56,65 @@ class RemoteTable
49
56
  #
50
57
  # Default: 0
51
58
  def sheet
52
- t.options['sheet'] || 0
59
+ current_options['sheet'] || 0
53
60
  end
54
61
 
55
62
  # Whether to keep blank rows
56
63
  #
57
64
  # Default: false
58
65
  def keep_blank_rows
59
- t.options['keep_blank_rows'] || false
66
+ current_options['keep_blank_rows'] || false
60
67
  end
61
68
 
62
69
  # Form data to send in with the download request
63
70
  def form_data
64
- t.options['form_data']
71
+ current_options['form_data']
65
72
  end
66
73
 
67
74
  # How many rows to skip
68
75
  #
69
76
  # Default: 0
70
77
  def skip
71
- t.options['skip'].to_i
78
+ current_options['skip'].to_i
72
79
  end
73
80
 
74
- # Likely external encoding
75
- #
76
- # Default: "UTF-8"
77
- def encoding
78
- @encoding ||= ::Array.wrap(t.options['encoding'] || [ 'ISO-8859-1', 'US-ASCII', 'WINDOWS-1252', 'ASCII-8BIT', 'UTF-8' ])
81
+ def internal_encoding
82
+ (current_options['encoding'] || 'UTF-8').upcase
83
+ end
84
+
85
+ def external_encoding
86
+ 'UTF-8'
87
+ end
88
+
89
+ def external_encoding_iconv
90
+ 'UTF-8//TRANSLIT'
79
91
  end
80
92
 
81
93
  # The delimiter
82
94
  #
83
95
  # Default: ","
84
96
  def delimiter
85
- t.options['delimiter'] || ','
97
+ current_options['delimiter'] || ','
86
98
  end
87
99
 
88
100
  # The XPath used to find rows
89
101
  def row_xpath
90
- t.options['row_xpath']
102
+ current_options['row_xpath']
91
103
  end
92
104
 
93
105
  # The XPath used to find columns
94
106
  def column_xpath
95
- t.options['column_xpath']
107
+ current_options['column_xpath']
96
108
  end
97
109
 
98
110
  # The CSS selector used to find rows
99
111
  def row_css
100
- t.options['row_css']
112
+ current_options['row_css']
101
113
  end
102
114
 
103
115
  # The CSS selector used to find columns
104
116
  def column_css
105
- t.options['column_css']
117
+ current_options['column_css']
106
118
  end
107
119
 
108
120
  # The compression type.
@@ -111,8 +123,8 @@ class RemoteTable
111
123
  #
112
124
  # Can be specified as: "gz", "zip", "bz2", "exe" (treated as "zip")
113
125
  def compression
114
- clue = if t.options['compression']
115
- t.options['compression'].to_s
126
+ clue = if current_options['compression']
127
+ current_options['compression'].to_s
116
128
  else
117
129
  ::File.extname uri.path
118
130
  end
@@ -134,8 +146,8 @@ class RemoteTable
134
146
  #
135
147
  # Can be specified as: "tar"
136
148
  def packing
137
- clue = if t.options['packing']
138
- t.options['packing'].to_s
149
+ clue = if current_options['packing']
150
+ current_options['packing'].to_s
139
151
  else
140
152
  ::File.extname(uri.path.sub(/\.#{compression}\z/, ''))
141
153
  end
@@ -150,7 +162,7 @@ class RemoteTable
150
162
  # Example:
151
163
  # RemoteTable.new 'http://www.fueleconomy.gov/FEG/epadata/08data.zip', 'glob' => '/*.csv'
152
164
  def glob
153
- t.options['glob']
165
+ current_options['glob']
154
166
  end
155
167
 
156
168
  # The filename, which can be used to pick a file out of an archive.
@@ -158,17 +170,17 @@ class RemoteTable
158
170
  # Example:
159
171
  # RemoteTable.new 'http://www.fueleconomy.gov/FEG/epadata/08data.zip', 'filename' => '2008_FE_guide_ALL_rel_dates_-no sales-for DOE-5-1-08.csv'
160
172
  def filename
161
- t.options['filename']
173
+ current_options['filename']
162
174
  end
163
175
 
164
176
  # Cut columns up to this character
165
177
  def cut
166
- t.options['cut']
178
+ current_options['cut']
167
179
  end
168
180
 
169
181
  # Crop rows after this line
170
182
  def crop
171
- t.options['crop']
183
+ current_options['crop']
172
184
  end
173
185
 
174
186
  # The fixed-width schema, given as an array
@@ -183,31 +195,31 @@ class RemoteTable
183
195
  # [ 'spacer', 12 ],
184
196
  # [ 'header6', 10, { :type => :string } ]])
185
197
  def schema
186
- t.options['schema']
198
+ current_options['schema']
187
199
  end
188
200
 
189
- # The name of the fixed-width schema according to Slither
201
+ # The name of the fixed-width schema according to FixedWidth
190
202
  def schema_name
191
- t.options['schema_name']
203
+ current_options['schema_name']
192
204
  end
193
205
 
194
206
  # A proc to call to decide whether to return a row.
195
207
  def select
196
- t.options['select']
208
+ current_options['select']
197
209
  end
198
210
 
199
211
  # A proc to call to decide whether to return a row.
200
212
  def reject
201
- t.options['reject']
213
+ current_options['reject']
202
214
  end
203
215
 
204
216
  # A hash of options to create a new Errata instance (see the Errata gem at http://github.com/seamusabshere/errata) to be used on every row.
205
217
  def errata
206
- return unless t.options.has_key? 'errata'
207
- @errata ||= if t.options['errata'].is_a? ::Hash
208
- ::Errata.new t.options['errata']
218
+ return unless current_options.has_key? 'errata'
219
+ @errata ||= if current_options['errata'].is_a? ::Hash
220
+ ::Errata.new current_options['errata']
209
221
  else
210
- t.options['errata']
222
+ current_options['errata']
211
223
  end
212
224
  end
213
225
 
@@ -220,8 +232,8 @@ class RemoteTable
220
232
  # Can be specified as: "xlsx", "xls", "csv", "ods", "fixed_width", "html"
221
233
  def format
222
234
  return Format::Delimited if uri.host == 'spreadsheets.google.com'
223
- clue = if t.options['format']
224
- t.options['format'].to_s
235
+ clue = if current_options['format']
236
+ current_options['format'].to_s
225
237
  else
226
238
  ::File.extname t.local_file.path
227
239
  end
@@ -1,3 +1,3 @@
1
1
  class RemoteTable
2
- VERSION = "1.2.2"
2
+ VERSION = "1.2.3"
3
3
  end
data/remote_table.gemspec CHANGED
@@ -21,7 +21,7 @@ Gem::Specification.new do |s|
21
21
 
22
22
  s.add_dependency 'activesupport', '>=2.3.4'
23
23
  s.add_dependency 'roo', '~>1.9'
24
- s.add_dependency 'slither', '>=0.99.4'
24
+ s.add_dependency 'fixed_width-multibyte' # TODO replace with fixed_width once timon gets off vacation
25
25
  s.add_dependency 'i18n' # activesupport?
26
26
  s.add_dependency 'builder' # roo?
27
27
  s.add_dependency 'zip' # roo
@@ -31,9 +31,7 @@ Gem::Specification.new do |s|
31
31
  s.add_dependency 'escape', '>=0.0.4'
32
32
  s.add_dependency 'posix-spawn'
33
33
  s.add_dependency 'ensure-encoding'
34
- unless RUBY_VERSION >= '1.9'
35
- s.add_dependency 'fastercsv', '>=1.5.0'
36
- end
34
+ s.add_dependency 'fastercsv', '>=1.5.0'
37
35
 
38
36
  s.add_development_dependency 'errata', '>=0.2.0'
39
37
  s.add_development_dependency 'test-unit'
data/test/helper.rb CHANGED
@@ -4,11 +4,26 @@ Bundler.setup
4
4
  require 'test/unit'
5
5
  require 'shoulda'
6
6
  require 'ruby-debug'
7
- require 'tempfile'
8
7
 
9
8
  $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
10
9
  $LOAD_PATH.unshift(File.dirname(__FILE__))
11
10
  require File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib', 'remote_table'))
12
11
 
13
12
  class Test::Unit::TestCase
13
+ def setup
14
+ if RUBY_VERSION >= '1.9'
15
+ @old_default_internal = Encoding.default_internal
16
+ @old_default_external = Encoding.default_external
17
+ # totally random choices here
18
+ Encoding.default_internal = 'EUC-JP'
19
+ Encoding.default_external = 'Shift_JIS'
20
+ end
21
+ end
22
+
23
+ def teardown
24
+ if RUBY_VERSION >= '1.9'
25
+ Encoding.default_internal = @old_default_internal
26
+ Encoding.default_external = @old_default_external
27
+ end
28
+ end
14
29
  end
@@ -16,7 +16,7 @@ $test2_rows.freeze
16
16
  class TestOldSyntax < Test::Unit::TestCase
17
17
  should "open an XLSX like an array (numbered columns)" do
18
18
  t = RemoteTable.new(:url => 'www.customerreferenceprogram.org/uploads/CRP_RFP_template.xlsx', :headers => false)
19
- assert_equal "Secure encryption of all data", t.rows[5][0]
19
+ assert_equal "Software-As-A-Service", t.rows[5][0]
20
20
  end
21
21
 
22
22
  should "open an XLSX with custom headers" do
@@ -1,36 +1,31 @@
1
1
  require 'helper'
2
2
 
3
- class FuelOilParser
3
+ class NaturalGasParser
4
4
  def initialize(options = {})
5
5
  # nothing
6
6
  end
7
- def add_hints!(bus)
8
- bus[:sheet] = 'Data 1'
9
- bus[:skip] = 2
10
- bus[:select] = lambda { |row| row['year'] > 1989 }
11
- end
12
7
  def apply(row)
13
8
  virtual_rows = []
14
- row.keys.grep(/(.+) Residual Fuel Oil/) do |location_column_name|
15
- first_part = $1
16
- next if (cost = row[location_column_name]).blank? or (date = row['Date']).blank?
17
- if first_part.start_with?('U.S.')
18
- locatable = "united_states (Country)"
19
- elsif first_part.include?('PADD')
20
- /\(PADD (.*)\)/.match(first_part)
21
- padd_part = $1
22
- next if padd_part == '1' # skip PADD 1 because we always prefer subdistricts
23
- locatable = "#{padd_part} (PetroleumAdministrationForDefenseDistrict)"
9
+ row.keys.grep(/\A(.*) Natural Gas/) do |location_column_name|
10
+ match_1 = $1
11
+ next if (price = row[location_column_name]).blank? or (date = row['Date']).blank?
12
+ if match_1 == 'U.S.'
13
+ locatable_id = 'US'
14
+ locatable_type = 'Country'
24
15
  else
25
- locatable = "#{first_part} (State)"
16
+ locatable_id = match_1 # name
17
+ locatable_type = 'State'
26
18
  end
27
19
  date = Time.parse(date)
28
- virtual_rows << {
29
- 'locatable' => locatable,
30
- 'cost' => cost,
31
- 'year' => date.year,
32
- 'month' => date.month
33
- }
20
+ new_row = ActiveSupport::OrderedHash.new
21
+ new_row['locatable_id'] = locatable_id
22
+ new_row['locatable_type'] = locatable_type
23
+ new_row['price'] = price
24
+ new_row['year'] = date.year
25
+ new_row['month'] = date.month
26
+ row_hash = RemoteTable::Transform.row_hash new_row
27
+ new_row['row_hash'] = row_hash
28
+ virtual_rows << new_row
34
29
  end
35
30
  virtual_rows
36
31
  end
@@ -38,12 +33,12 @@ end
38
33
 
39
34
  class TestOldTransform < Test::Unit::TestCase
40
35
  should "open an XLS with a parser" do
41
- ma_1990_01 = {"month"=>1, "cost"=>"54.0", "locatable"=>"Massachusetts (State)", "year"=>1990}
42
- ga_1990_01 = {"month"=>1, "cost"=>"50.7", "locatable"=>"Georgia (State)", "year"=>1990}
43
-
44
- t = RemoteTable.new(:url => 'http://tonto.eia.doe.gov/dnav/pet/xls/PET_PRI_RESID_A_EPPR_PTA_CPGAL_M.xls',
45
- :transform => { :class => FuelOilParser })
46
- assert t.rows.include?(ma_1990_01)
47
- assert t.rows.include?(ga_1990_01)
36
+ t = RemoteTable.new(:url => 'http://tonto.eia.doe.gov/dnav/ng/xls/ng_pri_sum_a_EPG0_FWA_DMcf_a.xls',
37
+ :sheet => 'Data 1',
38
+ :skip => 2,
39
+ :select => lambda { |row| row['year'].to_i > 1989 },
40
+ :transform => { :class => NaturalGasParser })
41
+ assert_equal 'Country', t[0]['locatable_type']
42
+ assert_equal 'US', t[0]['locatable_id']
48
43
  end
49
- end
44
+ end
@@ -63,12 +63,12 @@ class TestRemoteTable < Test::Unit::TestCase
63
63
 
64
64
  # fixes ArgumentError: invalid byte sequence in UTF-8
65
65
  should %{safely strip soft hyphens and read windows-1252 html} do
66
- t = RemoteTable.new :url => "http://www.faa.gov/air_traffic/publications/atpubs/CNT/5-2-A.htm", :row_xpath => '//table/tr[2]/td/table/tr', :column_xpath => 'td'
66
+ t = RemoteTable.new :url => "http://www.faa.gov/air_traffic/publications/atpubs/CNT/5-2-A.htm", :row_xpath => '//table/tr[2]/td/table/tr', :column_xpath => 'td', :encoding => 'windows-1252'
67
67
  assert t.rows.detect { |row| row['Model'] == 'A300B4600' }
68
68
  end
69
69
 
70
70
  should %{transliterate characters from ISO-8859-1} do
71
- t = RemoteTable.new :url => 'http://static.brighterplanet.com/science/data/consumables/pets/breed_genders.csv'
71
+ t = RemoteTable.new :url => 'http://static.brighterplanet.com/science/data/consumables/pets/breed_genders.csv', :encoding => 'ISO-8859-1'
72
72
  assert t.rows.detect { |row| row['name'] == 'Briquet Griffon Vendéen' }
73
73
  end
74
74
 
@@ -86,15 +86,42 @@ class TestRemoteTable < Test::Unit::TestCase
86
86
  assert(time1 != time2)
87
87
  end
88
88
 
89
- should %{not die when it reads Åland Islands} do
90
- t = RemoteTable.new 'http://www.iso.org/iso/list-en1-semic-3.txt', :skip => 2, :headers => false, :delimiter => ';'
91
- assert_nothing_raised do
92
- t[1][0]
89
+ {
90
+ # IMPOSSIBLE "../support/list-en1-semic-3.office-2011-for-mac-sp1-excel-95.binary.xls" => {:format=>"xls", :encoding=>"binary"},
91
+ "../support/list-en1-semic-3.office-2011-for-mac-sp1.binary.xlsx" => {:format=>"xlsx"},
92
+ "../support/list-en1-semic-3.office-2011-for-mac-sp1.binary.xls" => {:format=>"xls"},
93
+ "../support/list-en1-semic-3.neooffice.binary.ods" => {:format=>"ods"},
94
+ "../support/list-en1-semic-3.neooffice.iso-8859-1.fixed_width-64" => {:format=>"fixed_width", :encoding=>"iso-8859-1", :schema => [['name', 63, { :type => :string }], ['iso_3166', 2, { :type => :string }]]},
95
+ "../support/list-en1-semic-3.neooffice.utf-8.fixed_width-62" => {:format=>"fixed_width", :schema => [['name', 61, { :type => :string }], ['iso_3166', 2, { :type => :string }]]},
96
+ # TODO "../support/list-en1-semic-3.office-2011-for-mac-sp1.utf-8.html" => {:format=>"html" },
97
+ # TODO "../support/list-en1-semic-3.office-2011-for-mac-sp1.iso-8859-1.html" => {:format=>"html", :encoding=>"iso-8859-1"},
98
+ # TODO "../support/list-en1-semic-3.neooffice.utf-8.html" => {:format=>"html" },
99
+ "../support/list-en1-semic-3.neooffice.utf-8.xml" => {:format=>"xml", :row_css=>'Row', :column_css => 'Data', :select => lambda { |row| row[1].to_s =~ /[A-Z]{2}/ }},
100
+ "../support/list-en1-semic-3.neooffice.iso-8859-1.csv" => {:format=>"csv", :encoding=>"iso-8859-1", :delimiter => ';'},
101
+ "../support/list-en1-semic-3.original.iso-8859-1.csv" => {:format=>"csv", :encoding=>"iso-8859-1", :delimiter => ';'},
102
+ "../support/list-en1-semic-3.office-2011-for-mac-sp1.mac.csv-comma" => {:format=>"csv", :encoding=>"MACROMAN"}, # comma because no option in excel
103
+ "../support/list-en1-semic-3.neooffice.utf-8.csv" => {:format=>"csv", :delimiter => ';'}
104
+ }.each do |k, v|
105
+ should %{open #{k} with encoding #{v[:encoding] || 'default'}} do
106
+ options = v.merge(:headers => false, :skip => 2)
107
+ t = RemoteTable.new "file://#{File.expand_path(k, __FILE__)}", options
108
+ a = %{ÅLAND ISLANDS}
109
+ b = (t[1].is_a?(::Array) ? t[1][0] : t[1]['name'])
110
+ if RUBY_VERSION >= '1.9'
111
+ assert_equal 'UTF-8', a.encoding.to_s
112
+ assert_equal 'UTF-8', b.encoding.to_s
113
+ end
114
+ assert_equal a, b
93
115
  end
94
116
  end
95
117
 
118
+ should %{recode as UTF-8 even ISO-8859-1 (or any other encoding)} do
119
+ t = RemoteTable.new 'http://www.iso.org/iso/list-en1-semic-3.txt', :skip => 2, :headers => false, :delimiter => ';', :encoding => 'ISO-8859-1'
120
+ assert_equal %{ÅLAND ISLANDS}, t[1][0]
121
+ end
122
+
96
123
  should %{parse a big CSV that is not UTF-8} do
97
- t = RemoteTable.new 'https://openflights.svn.sourceforge.net/svnroot/openflights/openflights/data/airports.dat', :headers => false
124
+ t = RemoteTable.new 'https://openflights.svn.sourceforge.net/svnroot/openflights/openflights/data/airports.dat', :headers => false#, :encoding => 'UTF-8'
98
125
  assert_equal 'Goroka', t[0][1]
99
126
  end
100
127
  end
metadata CHANGED
@@ -1,13 +1,8 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: remote_table
3
3
  version: !ruby/object:Gem::Version
4
- hash: 27
5
4
  prerelease:
6
- segments:
7
- - 1
8
- - 2
9
- - 2
10
- version: 1.2.2
5
+ version: 1.2.3
11
6
  platform: ruby
12
7
  authors:
13
8
  - Seamus Abshere
@@ -16,7 +11,8 @@ autorequire:
16
11
  bindir: bin
17
12
  cert_chain: []
18
13
 
19
- date: 2011-05-05 00:00:00 Z
14
+ date: 2011-05-21 00:00:00 -05:00
15
+ default_executable:
20
16
  dependencies:
21
17
  - !ruby/object:Gem::Dependency
22
18
  name: activesupport
@@ -26,11 +22,6 @@ dependencies:
26
22
  requirements:
27
23
  - - ">="
28
24
  - !ruby/object:Gem::Version
29
- hash: 11
30
- segments:
31
- - 2
32
- - 3
33
- - 4
34
25
  version: 2.3.4
35
26
  type: :runtime
36
27
  version_requirements: *id001
@@ -42,27 +33,18 @@ dependencies:
42
33
  requirements:
43
34
  - - ~>
44
35
  - !ruby/object:Gem::Version
45
- hash: 29
46
- segments:
47
- - 1
48
- - 9
49
36
  version: "1.9"
50
37
  type: :runtime
51
38
  version_requirements: *id002
52
39
  - !ruby/object:Gem::Dependency
53
- name: slither
40
+ name: fixed_width-multibyte
54
41
  prerelease: false
55
42
  requirement: &id003 !ruby/object:Gem::Requirement
56
43
  none: false
57
44
  requirements:
58
45
  - - ">="
59
46
  - !ruby/object:Gem::Version
60
- hash: 411
61
- segments:
62
- - 0
63
- - 99
64
- - 4
65
- version: 0.99.4
47
+ version: "0"
66
48
  type: :runtime
67
49
  version_requirements: *id003
68
50
  - !ruby/object:Gem::Dependency
@@ -73,9 +55,6 @@ dependencies:
73
55
  requirements:
74
56
  - - ">="
75
57
  - !ruby/object:Gem::Version
76
- hash: 3
77
- segments:
78
- - 0
79
58
  version: "0"
80
59
  type: :runtime
81
60
  version_requirements: *id004
@@ -87,9 +66,6 @@ dependencies:
87
66
  requirements:
88
67
  - - ">="
89
68
  - !ruby/object:Gem::Version
90
- hash: 3
91
- segments:
92
- - 0
93
69
  version: "0"
94
70
  type: :runtime
95
71
  version_requirements: *id005
@@ -101,9 +77,6 @@ dependencies:
101
77
  requirements:
102
78
  - - ">="
103
79
  - !ruby/object:Gem::Version
104
- hash: 3
105
- segments:
106
- - 0
107
80
  version: "0"
108
81
  type: :runtime
109
82
  version_requirements: *id006
@@ -115,11 +88,6 @@ dependencies:
115
88
  requirements:
116
89
  - - ">="
117
90
  - !ruby/object:Gem::Version
118
- hash: 5
119
- segments:
120
- - 1
121
- - 4
122
- - 1
123
91
  version: 1.4.1
124
92
  type: :runtime
125
93
  version_requirements: *id007
@@ -131,9 +99,6 @@ dependencies:
131
99
  requirements:
132
100
  - - ">="
133
101
  - !ruby/object:Gem::Version
134
- hash: 3
135
- segments:
136
- - 0
137
102
  version: "0"
138
103
  type: :runtime
139
104
  version_requirements: *id008
@@ -145,9 +110,6 @@ dependencies:
145
110
  requirements:
146
111
  - - ">="
147
112
  - !ruby/object:Gem::Version
148
- hash: 3
149
- segments:
150
- - 0
151
113
  version: "0"
152
114
  type: :runtime
153
115
  version_requirements: *id009
@@ -159,11 +121,6 @@ dependencies:
159
121
  requirements:
160
122
  - - ">="
161
123
  - !ruby/object:Gem::Version
162
- hash: 23
163
- segments:
164
- - 0
165
- - 0
166
- - 4
167
124
  version: 0.0.4
168
125
  type: :runtime
169
126
  version_requirements: *id010
@@ -175,9 +132,6 @@ dependencies:
175
132
  requirements:
176
133
  - - ">="
177
134
  - !ruby/object:Gem::Version
178
- hash: 3
179
- segments:
180
- - 0
181
135
  version: "0"
182
136
  type: :runtime
183
137
  version_requirements: *id011
@@ -189,9 +143,6 @@ dependencies:
189
143
  requirements:
190
144
  - - ">="
191
145
  - !ruby/object:Gem::Version
192
- hash: 3
193
- segments:
194
- - 0
195
146
  version: "0"
196
147
  type: :runtime
197
148
  version_requirements: *id012
@@ -203,11 +154,6 @@ dependencies:
203
154
  requirements:
204
155
  - - ">="
205
156
  - !ruby/object:Gem::Version
206
- hash: 3
207
- segments:
208
- - 1
209
- - 5
210
- - 0
211
157
  version: 1.5.0
212
158
  type: :runtime
213
159
  version_requirements: *id013
@@ -219,11 +165,6 @@ dependencies:
219
165
  requirements:
220
166
  - - ">="
221
167
  - !ruby/object:Gem::Version
222
- hash: 23
223
- segments:
224
- - 0
225
- - 2
226
- - 0
227
168
  version: 0.2.0
228
169
  type: :development
229
170
  version_requirements: *id014
@@ -235,9 +176,6 @@ dependencies:
235
176
  requirements:
236
177
  - - ">="
237
178
  - !ruby/object:Gem::Version
238
- hash: 3
239
- segments:
240
- - 0
241
179
  version: "0"
242
180
  type: :development
243
181
  version_requirements: *id015
@@ -249,23 +187,17 @@ dependencies:
249
187
  requirements:
250
188
  - - ">="
251
189
  - !ruby/object:Gem::Version
252
- hash: 3
253
- segments:
254
- - 0
255
190
  version: "0"
256
191
  type: :development
257
192
  version_requirements: *id016
258
193
  - !ruby/object:Gem::Dependency
259
- name: ruby-debug
194
+ name: ruby-debug19
260
195
  prerelease: false
261
196
  requirement: &id017 !ruby/object:Gem::Requirement
262
197
  none: false
263
198
  requirements:
264
199
  - - ">="
265
200
  - !ruby/object:Gem::Version
266
- hash: 3
267
- segments:
268
- - 0
269
201
  version: "0"
270
202
  type: :development
271
203
  version_requirements: *id017
@@ -280,6 +212,7 @@ extra_rdoc_files: []
280
212
 
281
213
  files:
282
214
  - .document
215
+ - .gitattributes
283
216
  - .gitignore
284
217
  - Gemfile
285
218
  - LICENSE
@@ -305,11 +238,26 @@ files:
305
238
  - lib/remote_table/version.rb
306
239
  - remote_table.gemspec
307
240
  - test/helper.rb
241
+ - test/support/list-en1-semic-3.neooffice.binary.ods
242
+ - test/support/list-en1-semic-3.neooffice.iso-8859-1.csv
243
+ - test/support/list-en1-semic-3.neooffice.iso-8859-1.fixed_width-64
244
+ - test/support/list-en1-semic-3.neooffice.utf-8.csv
245
+ - test/support/list-en1-semic-3.neooffice.utf-8.fixed_width-62
246
+ - test/support/list-en1-semic-3.neooffice.utf-8.html
247
+ - test/support/list-en1-semic-3.neooffice.utf-8.xml
248
+ - test/support/list-en1-semic-3.office-2011-for-mac-sp1-excel-95.binary.xls
249
+ - test/support/list-en1-semic-3.office-2011-for-mac-sp1.binary.xls
250
+ - test/support/list-en1-semic-3.office-2011-for-mac-sp1.binary.xlsx
251
+ - test/support/list-en1-semic-3.office-2011-for-mac-sp1.iso-8859-1.html
252
+ - test/support/list-en1-semic-3.office-2011-for-mac-sp1.mac.csv-comma
253
+ - test/support/list-en1-semic-3.office-2011-for-mac-sp1.utf-8.html
254
+ - test/support/list-en1-semic-3.original.iso-8859-1.csv
308
255
  - test/test_big.rb
309
256
  - test/test_errata.rb
310
257
  - test/test_old_syntax.rb
311
258
  - test/test_old_transform.rb
312
259
  - test/test_remote_table.rb
260
+ has_rdoc: true
313
261
  homepage: https://github.com/seamusabshere/remote_table
314
262
  licenses: []
315
263
 
@@ -323,28 +271,36 @@ required_ruby_version: !ruby/object:Gem::Requirement
323
271
  requirements:
324
272
  - - ">="
325
273
  - !ruby/object:Gem::Version
326
- hash: 3
327
- segments:
328
- - 0
329
274
  version: "0"
330
275
  required_rubygems_version: !ruby/object:Gem::Requirement
331
276
  none: false
332
277
  requirements:
333
278
  - - ">="
334
279
  - !ruby/object:Gem::Version
335
- hash: 3
336
- segments:
337
- - 0
338
280
  version: "0"
339
281
  requirements: []
340
282
 
341
283
  rubyforge_project: remotetable
342
- rubygems_version: 1.7.2
284
+ rubygems_version: 1.6.2
343
285
  signing_key:
344
286
  specification_version: 3
345
287
  summary: Open local or remote XLSX, XLS, ODS, CSV and fixed-width files.
346
288
  test_files:
347
289
  - test/helper.rb
290
+ - test/support/list-en1-semic-3.neooffice.binary.ods
291
+ - test/support/list-en1-semic-3.neooffice.iso-8859-1.csv
292
+ - test/support/list-en1-semic-3.neooffice.iso-8859-1.fixed_width-64
293
+ - test/support/list-en1-semic-3.neooffice.utf-8.csv
294
+ - test/support/list-en1-semic-3.neooffice.utf-8.fixed_width-62
295
+ - test/support/list-en1-semic-3.neooffice.utf-8.html
296
+ - test/support/list-en1-semic-3.neooffice.utf-8.xml
297
+ - test/support/list-en1-semic-3.office-2011-for-mac-sp1-excel-95.binary.xls
298
+ - test/support/list-en1-semic-3.office-2011-for-mac-sp1.binary.xls
299
+ - test/support/list-en1-semic-3.office-2011-for-mac-sp1.binary.xlsx
300
+ - test/support/list-en1-semic-3.office-2011-for-mac-sp1.iso-8859-1.html
301
+ - test/support/list-en1-semic-3.office-2011-for-mac-sp1.mac.csv-comma
302
+ - test/support/list-en1-semic-3.office-2011-for-mac-sp1.utf-8.html
303
+ - test/support/list-en1-semic-3.original.iso-8859-1.csv
348
304
  - test/test_big.rb
349
305
  - test/test_errata.rb
350
306
  - test/test_old_syntax.rb