remote_table 0.2.22 → 0.2.23

Sign up to get free protection for your applications and to get access to all the features.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.2.22
1
+ 0.2.23
@@ -3,6 +3,7 @@ class RemoteTable
3
3
  def each_row(&block)
4
4
  backup_file!
5
5
  convert_file_to_utf8!
6
+ remove_useless_characters!
6
7
  skip_rows!
7
8
  FasterCSV.foreach(path, fastercsv_options) do |row|
8
9
  ordered_hash = ActiveSupport::OrderedHash.new
@@ -3,6 +3,7 @@ class RemoteTable
3
3
  def each_row(&block)
4
4
  backup_file!
5
5
  convert_file_to_utf8!
6
+ remove_useless_characters!
6
7
  crop_rows!
7
8
  skip_rows!
8
9
  cut_columns!
@@ -3,6 +3,7 @@ class RemoteTable
3
3
  def each_row(&block)
4
4
  backup_file!
5
5
  convert_file_to_utf8!
6
+ remove_useless_characters!
6
7
  html_headers = (headers.is_a?(Array)) ? headers : nil
7
8
  Nokogiri::HTML(unescaped_html_without_soft_hyphens, nil, 'UTF-8').xpath(row_xpath).each do |row|
8
9
  values = row.xpath(column_xpath).map { |td| td.content.gsub(/\s+/, ' ').strip }
@@ -29,7 +30,7 @@ class RemoteTable
29
30
  # should we be doing this in ruby?
30
31
  def unescaped_html_without_soft_hyphens
31
32
  str = CGI.unescapeHTML IO.read(path)
32
- str.gsub! /&shy;|\302\255/, ''
33
+ str.gsub! /&shy;/, ''
33
34
  str
34
35
  end
35
36
  end
@@ -63,8 +63,16 @@ class RemoteTable
63
63
  FileUtils.mv "#{path}.tmp", path
64
64
  end
65
65
 
66
+ USELESS_CHARACTERS = [
67
+ '\xef\xbb\xbf', # UTF-8 byte order mark
68
+ '\xc2\xad' # soft hyphen, often inserted by MS Office (html: &shy;)
69
+ ]
70
+ def remove_useless_characters!
71
+ RemoteTable.backtick_with_reporting "perl -pe 's/#{USELESS_CHARACTERS.join '//g; s/'}//g' #{path} > #{path}.tmp"
72
+ FileUtils.mv "#{path}.tmp", path
73
+ end
74
+
66
75
  def convert_file_to_utf8!
67
- return if encoding == 'UTF-8' or encoding == 'UTF8'
68
76
  RemoteTable.backtick_with_reporting "iconv -c -f #{encoding} -t UTF-8 #{path} > #{path}.tmp"
69
77
  FileUtils.mv "#{path}.tmp", path
70
78
  end
data/remote_table.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{remote_table}
8
- s.version = "0.2.22"
8
+ s.version = "0.2.23"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Seamus Abshere", "Andy Rossmeissl"]
12
- s.date = %q{2010-05-17}
12
+ s.date = %q{2010-05-21}
13
13
  s.description = %q{Remotely open and parse Excel XLS, ODS, CSV and fixed-width tables.}
14
14
  s.email = %q{seamus@abshere.net}
15
15
  s.extra_rdoc_files = [
@@ -128,6 +128,14 @@ class RemoteTableTest < Test::Unit::TestCase
128
128
  end
129
129
 
130
130
  if ENV['ALL'] == 'true' or ENV['NEW'] == 'true'
131
+ end
132
+
133
+ if ENV['ALL'] == 'true' or ENV['FAST'] == 'true'
134
+ should "ignore UTF-8 byte order marks" do
135
+ t = RemoteTable.new :url => 'http://www.freebase.com/type/exporttypeinstances/base/horses/horse_breed?page=0&filter_mode=type&filter_view=table&show%01p%3D%2Ftype%2Fobject%2Fname%01index=0&show%01p%3D%2Fcommon%2Ftopic%2Fimage%01index=1&show%01p%3D%2Fcommon%2Ftopic%2Farticle%01index=2&sort%01p%3D%2Ftype%2Fobject%2Ftype%01p%3Dlink%01p%3D%2Ftype%2Flink%2Ftimestamp%01index=false&=&exporttype=csv-8'
136
+ assert_equal 'Tawleed', t.rows.first['name']
137
+ end
138
+
131
139
  should "be able to apply errata files" do
132
140
  t = RemoteTable.new :url => "http://www.faa.gov/air_traffic/publications/atpubs/CNT/5-2-G.htm",
133
141
  :encoding => 'windows-1252',
@@ -140,9 +148,7 @@ class RemoteTableTest < Test::Unit::TestCase
140
148
  assert_equal 'GRUMMAN', g1['Manufacturer']
141
149
  assert_equal 'G159 Gulfstream I (TC4 Academe, VC4)', g1['Model']
142
150
  end
143
- end
144
-
145
- if ENV['ALL'] == 'true' or ENV['FAST'] == 'true'
151
+
146
152
  # this will die with an error about libcurl if your curl doesn't support ssl
147
153
  should "connect using HTTPS if available" do
148
154
  t = RemoteTable.new(:url => 'https://spreadsheets.google.com/pub?key=t5HM1KbaRngmTUbntg8JwPA')
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 2
8
- - 22
9
- version: 0.2.22
8
+ - 23
9
+ version: 0.2.23
10
10
  platform: ruby
11
11
  authors:
12
12
  - Seamus Abshere
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-05-17 00:00:00 -04:00
18
+ date: 2010-05-21 00:00:00 -04:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency