remote_table 0.2.22 → 0.2.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.2.22
1
+ 0.2.23
@@ -3,6 +3,7 @@ class RemoteTable
3
3
  def each_row(&block)
4
4
  backup_file!
5
5
  convert_file_to_utf8!
6
+ remove_useless_characters!
6
7
  skip_rows!
7
8
  FasterCSV.foreach(path, fastercsv_options) do |row|
8
9
  ordered_hash = ActiveSupport::OrderedHash.new
@@ -3,6 +3,7 @@ class RemoteTable
3
3
  def each_row(&block)
4
4
  backup_file!
5
5
  convert_file_to_utf8!
6
+ remove_useless_characters!
6
7
  crop_rows!
7
8
  skip_rows!
8
9
  cut_columns!
@@ -3,6 +3,7 @@ class RemoteTable
3
3
  def each_row(&block)
4
4
  backup_file!
5
5
  convert_file_to_utf8!
6
+ remove_useless_characters!
6
7
  html_headers = (headers.is_a?(Array)) ? headers : nil
7
8
  Nokogiri::HTML(unescaped_html_without_soft_hyphens, nil, 'UTF-8').xpath(row_xpath).each do |row|
8
9
  values = row.xpath(column_xpath).map { |td| td.content.gsub(/\s+/, ' ').strip }
@@ -29,7 +30,7 @@ class RemoteTable
29
30
  # should we be doing this in ruby?
30
31
  def unescaped_html_without_soft_hyphens
31
32
  str = CGI.unescapeHTML IO.read(path)
32
- str.gsub! /&shy;|\302\255/, ''
33
+ str.gsub! /&shy;/, ''
33
34
  str
34
35
  end
35
36
  end
@@ -63,8 +63,16 @@ class RemoteTable
63
63
  FileUtils.mv "#{path}.tmp", path
64
64
  end
65
65
 
66
+ USELESS_CHARACTERS = [
67
+ '\xef\xbb\xbf', # UTF-8 byte order mark
68
+ '\xc2\xad' # soft hyphen, often inserted by MS Office (html: &shy;)
69
+ ]
70
+ def remove_useless_characters!
71
+ RemoteTable.backtick_with_reporting "perl -pe 's/#{USELESS_CHARACTERS.join '//g; s/'}//g' #{path} > #{path}.tmp"
72
+ FileUtils.mv "#{path}.tmp", path
73
+ end
74
+
66
75
  def convert_file_to_utf8!
67
- return if encoding == 'UTF-8' or encoding == 'UTF8'
68
76
  RemoteTable.backtick_with_reporting "iconv -c -f #{encoding} -t UTF-8 #{path} > #{path}.tmp"
69
77
  FileUtils.mv "#{path}.tmp", path
70
78
  end
data/remote_table.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{remote_table}
8
- s.version = "0.2.22"
8
+ s.version = "0.2.23"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Seamus Abshere", "Andy Rossmeissl"]
12
- s.date = %q{2010-05-17}
12
+ s.date = %q{2010-05-21}
13
13
  s.description = %q{Remotely open and parse Excel XLS, ODS, CSV and fixed-width tables.}
14
14
  s.email = %q{seamus@abshere.net}
15
15
  s.extra_rdoc_files = [
@@ -128,6 +128,14 @@ class RemoteTableTest < Test::Unit::TestCase
128
128
  end
129
129
 
130
130
  if ENV['ALL'] == 'true' or ENV['NEW'] == 'true'
131
+ end
132
+
133
+ if ENV['ALL'] == 'true' or ENV['FAST'] == 'true'
134
+ should "ignore UTF-8 byte order marks" do
135
+ t = RemoteTable.new :url => 'http://www.freebase.com/type/exporttypeinstances/base/horses/horse_breed?page=0&filter_mode=type&filter_view=table&show%01p%3D%2Ftype%2Fobject%2Fname%01index=0&show%01p%3D%2Fcommon%2Ftopic%2Fimage%01index=1&show%01p%3D%2Fcommon%2Ftopic%2Farticle%01index=2&sort%01p%3D%2Ftype%2Fobject%2Ftype%01p%3Dlink%01p%3D%2Ftype%2Flink%2Ftimestamp%01index=false&=&exporttype=csv-8'
136
+ assert_equal 'Tawleed', t.rows.first['name']
137
+ end
138
+
131
139
  should "be able to apply errata files" do
132
140
  t = RemoteTable.new :url => "http://www.faa.gov/air_traffic/publications/atpubs/CNT/5-2-G.htm",
133
141
  :encoding => 'windows-1252',
@@ -140,9 +148,7 @@ class RemoteTableTest < Test::Unit::TestCase
140
148
  assert_equal 'GRUMMAN', g1['Manufacturer']
141
149
  assert_equal 'G159 Gulfstream I (TC4 Academe, VC4)', g1['Model']
142
150
  end
143
- end
144
-
145
- if ENV['ALL'] == 'true' or ENV['FAST'] == 'true'
151
+
146
152
  # this will die with an error about libcurl if your curl doesn't support ssl
147
153
  should "connect using HTTPS if available" do
148
154
  t = RemoteTable.new(:url => 'https://spreadsheets.google.com/pub?key=t5HM1KbaRngmTUbntg8JwPA')
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 2
8
- - 22
9
- version: 0.2.22
8
+ - 23
9
+ version: 0.2.23
10
10
  platform: ruby
11
11
  authors:
12
12
  - Seamus Abshere
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-05-17 00:00:00 -04:00
18
+ date: 2010-05-21 00:00:00 -04:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency