remote_table 1.3.0 → 1.4.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +2 -0
- data/CHANGELOG +19 -0
- data/Gemfile +7 -1
- data/README.markdown +440 -0
- data/Rakefile +6 -14
- data/lib/remote_table.rb +27 -38
- data/lib/remote_table/{properties.rb → config.rb} +39 -43
- data/lib/remote_table/format.rb +24 -27
- data/lib/remote_table/format/delimited.rb +17 -21
- data/lib/remote_table/format/fixed_width.rb +9 -9
- data/lib/remote_table/format/html.rb +0 -2
- data/lib/remote_table/format/mixins/processed_by_nokogiri.rb +13 -12
- data/lib/remote_table/format/mixins/processed_by_roo.rb +17 -13
- data/lib/remote_table/format/mixins/textual.rb +13 -13
- data/lib/remote_table/format/open_office.rb +3 -0
- data/lib/remote_table/format/xml.rb +0 -2
- data/lib/remote_table/format/yaml.rb +14 -0
- data/lib/remote_table/local_file.rb +69 -7
- data/lib/remote_table/transformer.rb +7 -4
- data/lib/remote_table/version.rb +1 -1
- data/remote_table.gemspec +5 -13
- data/test/fixtures/data.yml +4 -0
- data/test/helper.rb +8 -9
- data/test/test_big.rb +43 -53
- data/test/test_errata.rb +27 -25
- data/test/test_old_syntax.rb +193 -191
- data/test/test_old_transform.rb +12 -10
- data/test/test_remote_table.rb +57 -47
- metadata +48 -64
- data/.document +0 -5
- data/README.rdoc +0 -167
- data/lib/remote_table/utils.rb +0 -157
data/Rakefile
CHANGED
@@ -1,25 +1,17 @@
|
|
1
|
-
|
2
|
-
|
1
|
+
#!/usr/bin/env rake
|
2
|
+
require "bundler/gem_tasks"
|
3
3
|
|
4
4
|
require 'rake'
|
5
5
|
require 'rake/testtask'
|
6
6
|
Rake::TestTask.new(:test) do |test|
|
7
|
-
test.libs << '
|
7
|
+
test.libs << 'test'
|
8
8
|
test.pattern = 'test/**/test_*.rb'
|
9
9
|
test.verbose = true
|
10
10
|
end
|
11
11
|
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
rdoc.rdoc_dir = 'rdoc'
|
16
|
-
rdoc.title = 'remote_table'
|
17
|
-
rdoc.options << '--line-numbers' << '--inline-source'
|
18
|
-
rdoc.rdoc_files.include('README*')
|
19
|
-
rdoc.rdoc_files.include('lib/**/*.rb')
|
20
|
-
end
|
21
|
-
rescue LoadError
|
22
|
-
puts "Rdoc is not available"
|
12
|
+
require 'yard'
|
13
|
+
YARD::Rake::YardocTask.new do |y|
|
14
|
+
y.options << '--no-private'
|
23
15
|
end
|
24
16
|
|
25
17
|
task :default => :test
|
data/lib/remote_table.rb
CHANGED
@@ -1,20 +1,20 @@
|
|
1
1
|
if ::RUBY_VERSION < '1.9' and $KCODE != 'UTF8'
|
2
|
-
|
2
|
+
::Kernel.warn "[remote_table] Ruby 1.8 detected, setting $KCODE to UTF8 so that ActiveSupport::Multibyte works properly."
|
3
3
|
$KCODE = 'UTF8'
|
4
4
|
end
|
5
5
|
|
6
6
|
require 'active_support'
|
7
7
|
require 'active_support/version'
|
8
|
-
|
9
|
-
active_support/core_ext
|
10
|
-
|
11
|
-
active_support/core_ext/module
|
12
|
-
active_support/core_ext/array
|
13
|
-
}.each do |active_support_3_requirement|
|
14
|
-
require active_support_3_requirement
|
15
|
-
end if ::ActiveSupport::VERSION::MAJOR == 3
|
8
|
+
if ::ActiveSupport::VERSION::MAJOR >= 3
|
9
|
+
require 'active_support/core_ext'
|
10
|
+
end
|
16
11
|
require 'hash_digest'
|
17
12
|
|
13
|
+
require 'remote_table/format'
|
14
|
+
require 'remote_table/config'
|
15
|
+
require 'remote_table/local_file'
|
16
|
+
require 'remote_table/transformer'
|
17
|
+
|
18
18
|
class Hash
|
19
19
|
attr_accessor :row_hash
|
20
20
|
end
|
@@ -23,13 +23,7 @@ class Array
|
|
23
23
|
attr_accessor :row_hash
|
24
24
|
end
|
25
25
|
|
26
|
-
class RemoteTable
|
27
|
-
autoload :Format, 'remote_table/format'
|
28
|
-
autoload :Properties, 'remote_table/properties'
|
29
|
-
autoload :LocalFile, 'remote_table/local_file'
|
30
|
-
autoload :Transformer, 'remote_table/transformer'
|
31
|
-
autoload :Utils, 'remote_table/utils'
|
32
|
-
|
26
|
+
class RemoteTable
|
33
27
|
# Legacy
|
34
28
|
class Transform
|
35
29
|
def self.row_hash(row)
|
@@ -40,7 +34,7 @@ class RemoteTable
|
|
40
34
|
include ::Enumerable
|
41
35
|
|
42
36
|
attr_reader :url
|
43
|
-
attr_reader :
|
37
|
+
attr_reader :config
|
44
38
|
|
45
39
|
# Create a new RemoteTable.
|
46
40
|
#
|
@@ -51,16 +45,16 @@ class RemoteTable
|
|
51
45
|
# Old syntax:
|
52
46
|
# RemoteTable.new(:url => 'www.customerreferenceprogram.org/uploads/CRP_RFP_template.xlsx', :foo => 'bar')
|
53
47
|
#
|
54
|
-
# See the <tt>
|
48
|
+
# See the <tt>Config</tt> object for the sorts of options you can pass.
|
55
49
|
def initialize(*args)
|
56
|
-
|
50
|
+
options = args.last.is_a?(::Hash) ? args.last.symbolize_keys : {}
|
51
|
+
|
57
52
|
@url = if args.first.is_a? ::String
|
58
53
|
args.first.dup
|
59
54
|
else
|
60
|
-
|
55
|
+
options[:url].dup
|
61
56
|
end
|
62
|
-
@
|
63
|
-
@options.freeze
|
57
|
+
@config = Config.new self, options
|
64
58
|
end
|
65
59
|
|
66
60
|
# not thread safe
|
@@ -72,17 +66,17 @@ class RemoteTable
|
|
72
66
|
retval = format.each do |row|
|
73
67
|
transformer.transform(row).each do |virtual_row|
|
74
68
|
virtual_row.row_hash = ::HashDigest.hexdigest row
|
75
|
-
if
|
76
|
-
next if
|
77
|
-
|
69
|
+
if config.errata
|
70
|
+
next if config.errata.rejects? virtual_row
|
71
|
+
config.errata.correct! virtual_row
|
78
72
|
end
|
79
|
-
next if
|
80
|
-
next if
|
81
|
-
cache.push virtual_row unless
|
73
|
+
next if config.select and !config.select.call(virtual_row)
|
74
|
+
next if config.reject and config.reject.call(virtual_row)
|
75
|
+
cache.push virtual_row unless config.streaming
|
82
76
|
yield virtual_row
|
83
77
|
end
|
84
78
|
end
|
85
|
-
fully_cached! unless
|
79
|
+
fully_cached! unless config.streaming
|
86
80
|
retval
|
87
81
|
end
|
88
82
|
end
|
@@ -117,17 +111,12 @@ class RemoteTable
|
|
117
111
|
@local_file ||= LocalFile.new self
|
118
112
|
end
|
119
113
|
|
120
|
-
# Used internally to access to the properties of the table, either set by the user or implied
|
121
|
-
def properties
|
122
|
-
@properties ||= Properties.new self
|
123
|
-
end
|
124
|
-
|
125
114
|
# Used internally to access the driver that reads the format
|
126
115
|
def format
|
127
|
-
@format ||=
|
116
|
+
@format ||= config.format.new self
|
128
117
|
end
|
129
118
|
|
130
|
-
# Used internally to
|
119
|
+
# Used internally to access the transformer (aka parser).
|
131
120
|
def transformer
|
132
121
|
@transformer ||= Transformer.new self
|
133
122
|
end
|
@@ -139,8 +128,8 @@ class RemoteTable
|
|
139
128
|
def mark_download!
|
140
129
|
@download_count ||= 0
|
141
130
|
@download_count += 1
|
142
|
-
if
|
143
|
-
|
131
|
+
if config.warn_on_multiple_downloads and download_count > 1
|
132
|
+
::Kernel.warn "[remote_table] #{url} has been downloaded #{download_count} times."
|
144
133
|
end
|
145
134
|
end
|
146
135
|
|
@@ -1,19 +1,15 @@
|
|
1
1
|
require 'uri'
|
2
2
|
class RemoteTable
|
3
|
-
# Represents the
|
4
|
-
class
|
3
|
+
# Represents the configuration of a RemoteTable, whether its options are explicitly set by the user or inferred automatically.
|
4
|
+
class Config
|
5
5
|
attr_reader :t
|
6
|
-
attr_reader :
|
6
|
+
attr_reader :user_specified_options
|
7
7
|
|
8
|
-
def initialize(t)
|
8
|
+
def initialize(t, user_specified_options)
|
9
9
|
@t = t
|
10
|
-
@
|
10
|
+
@user_specified_options = user_specified_options
|
11
11
|
end
|
12
|
-
|
13
|
-
def update(options)
|
14
|
-
current_options.update options
|
15
|
-
end
|
16
|
-
|
12
|
+
|
17
13
|
# The parsed URI of the file to get.
|
18
14
|
def uri
|
19
15
|
return @uri if @uri.is_a?(::URI)
|
@@ -29,19 +25,19 @@ class RemoteTable
|
|
29
25
|
# * call each
|
30
26
|
# Defaults to false.
|
31
27
|
def streaming
|
32
|
-
|
28
|
+
user_specified_options.fetch :streaming, false
|
33
29
|
end
|
34
30
|
|
35
31
|
# Defaults to true.
|
36
32
|
def warn_on_multiple_downloads
|
37
|
-
|
33
|
+
user_specified_options[:warn_on_multiple_downloads] != false
|
38
34
|
end
|
39
35
|
|
40
36
|
# The headers specified by the user
|
41
37
|
#
|
42
38
|
# Default: :first_row
|
43
39
|
def headers
|
44
|
-
|
40
|
+
user_specified_options[:headers].nil? ? :first_row : user_specified_options[:headers]
|
45
41
|
end
|
46
42
|
|
47
43
|
def use_first_row_as_header?
|
@@ -53,33 +49,31 @@ class RemoteTable
|
|
53
49
|
end
|
54
50
|
|
55
51
|
# The sheet specified by the user as a number or a string
|
56
|
-
#
|
57
|
-
# Default: 0
|
58
52
|
def sheet
|
59
|
-
|
53
|
+
user_specified_options[:sheet]
|
60
54
|
end
|
61
55
|
|
62
56
|
# Whether to keep blank rows
|
63
57
|
#
|
64
58
|
# Default: false
|
65
59
|
def keep_blank_rows
|
66
|
-
|
60
|
+
user_specified_options.fetch :keep_blank_rows, false
|
67
61
|
end
|
68
62
|
|
69
63
|
# Form data to send in with the download request
|
70
64
|
def form_data
|
71
|
-
|
65
|
+
user_specified_options[:form_data]
|
72
66
|
end
|
73
67
|
|
74
68
|
# How many rows to skip
|
75
69
|
#
|
76
70
|
# Default: 0
|
77
71
|
def skip
|
78
|
-
|
72
|
+
user_specified_options.fetch :skip, 0
|
79
73
|
end
|
80
74
|
|
81
75
|
def internal_encoding
|
82
|
-
(
|
76
|
+
user_specified_options.fetch(:encoding, 'UTF-8').upcase
|
83
77
|
end
|
84
78
|
|
85
79
|
def external_encoding
|
@@ -94,27 +88,27 @@ class RemoteTable
|
|
94
88
|
#
|
95
89
|
# Default: ","
|
96
90
|
def delimiter
|
97
|
-
|
91
|
+
user_specified_options.fetch :delimiter, ','
|
98
92
|
end
|
99
93
|
|
100
94
|
# The XPath used to find rows
|
101
95
|
def row_xpath
|
102
|
-
|
96
|
+
user_specified_options[:row_xpath]
|
103
97
|
end
|
104
98
|
|
105
99
|
# The XPath used to find columns
|
106
100
|
def column_xpath
|
107
|
-
|
101
|
+
user_specified_options[:column_xpath]
|
108
102
|
end
|
109
103
|
|
110
104
|
# The CSS selector used to find rows
|
111
105
|
def row_css
|
112
|
-
|
106
|
+
user_specified_options[:row_css]
|
113
107
|
end
|
114
108
|
|
115
109
|
# The CSS selector used to find columns
|
116
110
|
def column_css
|
117
|
-
|
111
|
+
user_specified_options[:column_css]
|
118
112
|
end
|
119
113
|
|
120
114
|
# The compression type.
|
@@ -123,8 +117,8 @@ class RemoteTable
|
|
123
117
|
#
|
124
118
|
# Can be specified as: :gz, :zip, :bz2, :exe (treated as :zip)
|
125
119
|
def compression
|
126
|
-
if
|
127
|
-
return
|
120
|
+
if user_specified_options.has_key?(:compression)
|
121
|
+
return user_specified_options[:compression]
|
128
122
|
end
|
129
123
|
case ::File.extname(uri.path).downcase
|
130
124
|
when /gz/, /gunzip/
|
@@ -144,8 +138,8 @@ class RemoteTable
|
|
144
138
|
#
|
145
139
|
# Can be specified as: :tar
|
146
140
|
def packing
|
147
|
-
if
|
148
|
-
return
|
141
|
+
if user_specified_options.has_key?(:packing)
|
142
|
+
return user_specified_options[:packing]
|
149
143
|
end
|
150
144
|
if uri.path =~ %r{\.tar(?:\.|$)}i
|
151
145
|
:tar
|
@@ -157,7 +151,7 @@ class RemoteTable
|
|
157
151
|
# Example:
|
158
152
|
# RemoteTable.new 'http://www.fueleconomy.gov/FEG/epadata/08data.zip', :glob => '/*.csv'
|
159
153
|
def glob
|
160
|
-
|
154
|
+
user_specified_options[:glob]
|
161
155
|
end
|
162
156
|
|
163
157
|
# The filename, which can be used to pick a file out of an archive.
|
@@ -165,17 +159,17 @@ class RemoteTable
|
|
165
159
|
# Example:
|
166
160
|
# RemoteTable.new 'http://www.fueleconomy.gov/FEG/epadata/08data.zip', :filename => '2008_FE_guide_ALL_rel_dates_-no sales-for DOE-5-1-08.csv'
|
167
161
|
def filename
|
168
|
-
|
162
|
+
user_specified_options[:filename]
|
169
163
|
end
|
170
164
|
|
171
165
|
# Cut columns up to this character
|
172
166
|
def cut
|
173
|
-
|
167
|
+
user_specified_options[:cut]
|
174
168
|
end
|
175
169
|
|
176
170
|
# Crop rows after this line
|
177
171
|
def crop
|
178
|
-
|
172
|
+
user_specified_options[:crop]
|
179
173
|
end
|
180
174
|
|
181
175
|
# The fixed-width schema, given as an array
|
@@ -190,31 +184,31 @@ class RemoteTable
|
|
190
184
|
# [ 'spacer', 12 ],
|
191
185
|
# [ 'header6', 10, { :type => :string } ]])
|
192
186
|
def schema
|
193
|
-
|
187
|
+
user_specified_options[:schema]
|
194
188
|
end
|
195
189
|
|
196
190
|
# The name of the fixed-width schema according to FixedWidth
|
197
191
|
def schema_name
|
198
|
-
|
192
|
+
user_specified_options[:schema_name]
|
199
193
|
end
|
200
194
|
|
201
195
|
# A proc to call to decide whether to return a row.
|
202
196
|
def select
|
203
|
-
|
197
|
+
user_specified_options[:select]
|
204
198
|
end
|
205
199
|
|
206
200
|
# A proc to call to decide whether to reject (skip) a row; rows for which it returns a truthy value are not returned.
|
207
201
|
def reject
|
208
|
-
|
202
|
+
user_specified_options[:reject]
|
209
203
|
end
|
210
204
|
|
211
205
|
# A hash of options to create a new Errata instance (see the Errata gem at http://github.com/seamusabshere/errata) to be used on every row.
|
212
206
|
def errata
|
213
|
-
return unless
|
214
|
-
@errata ||= if
|
215
|
-
::Errata.new
|
207
|
+
return unless user_specified_options.has_key? :errata
|
208
|
+
@errata ||= if user_specified_options[:errata].is_a? ::Hash
|
209
|
+
::Errata.new user_specified_options[:errata]
|
216
210
|
else
|
217
|
-
|
211
|
+
user_specified_options[:errata]
|
218
212
|
end
|
219
213
|
end
|
220
214
|
|
@@ -227,8 +221,8 @@ class RemoteTable
|
|
227
221
|
# Can be specified as: :xlsx, :xls, :delimited (aka :csv and :tsv), :ods, :fixed_width, :html
|
228
222
|
def format
|
229
223
|
return Format::Delimited if uri.host == 'spreadsheets.google.com' or @uri.host == 'docs.google.com'
|
230
|
-
clue = if
|
231
|
-
|
224
|
+
clue = if user_specified_options.has_key?(:format)
|
225
|
+
user_specified_options[:format]
|
232
226
|
else
|
233
227
|
t.local_file.path
|
234
228
|
end
|
@@ -247,6 +241,8 @@ class RemoteTable
|
|
247
241
|
Format::HTML
|
248
242
|
when /xml/
|
249
243
|
Format::XML
|
244
|
+
when /yaml/, /yml/
|
245
|
+
Format::Yaml
|
250
246
|
else
|
251
247
|
Format::Delimited
|
252
248
|
end
|
data/lib/remote_table/format.rb
CHANGED
@@ -1,22 +1,22 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
1
|
+
require 'iconv'
|
2
|
+
if RUBY_VERSION >= '1.9'
|
3
|
+
# for an excellent explanation see http://blog.segment7.net/2010/12/17/from-iconv-iconv-to-string-encode
|
4
|
+
Kernel.warn "[remote_table] Apologies - using iconv because Ruby 1.9.x's String#encode doesn't have transliteration tables (yet)"
|
5
5
|
end
|
6
6
|
|
7
|
+
require 'remote_table/format/mixins/textual'
|
8
|
+
require 'remote_table/format/mixins/processed_by_roo'
|
9
|
+
require 'remote_table/format/mixins/processed_by_nokogiri'
|
10
|
+
require 'remote_table/format/excel'
|
11
|
+
require 'remote_table/format/excelx'
|
12
|
+
require 'remote_table/format/delimited'
|
13
|
+
require 'remote_table/format/open_office'
|
14
|
+
require 'remote_table/format/fixed_width'
|
15
|
+
require 'remote_table/format/html'
|
16
|
+
require 'remote_table/format/xml'
|
17
|
+
require 'remote_table/format/yaml'
|
7
18
|
class RemoteTable
|
8
19
|
class Format
|
9
|
-
autoload :Excel, 'remote_table/format/excel'
|
10
|
-
autoload :Excelx, 'remote_table/format/excelx'
|
11
|
-
autoload :Delimited, 'remote_table/format/delimited'
|
12
|
-
autoload :OpenOffice, 'remote_table/format/open_office'
|
13
|
-
autoload :FixedWidth, 'remote_table/format/fixed_width'
|
14
|
-
autoload :HTML, 'remote_table/format/html'
|
15
|
-
autoload :XML, 'remote_table/format/xml'
|
16
|
-
|
17
|
-
autoload :Textual, 'remote_table/format/mixins/textual'
|
18
|
-
autoload :ProcessedByRoo, 'remote_table/format/mixins/processed_by_roo'
|
19
|
-
autoload :ProcessedByNokogiri, 'remote_table/format/mixins/processed_by_nokogiri'
|
20
20
|
|
21
21
|
attr_reader :t
|
22
22
|
|
@@ -25,28 +25,25 @@ class RemoteTable
|
|
25
25
|
end
|
26
26
|
|
27
27
|
def transliterate_to_utf8(str)
|
28
|
-
|
29
|
-
|
30
|
-
transliterated_str = if ::RUBY_VERSION >= '1.9'
|
31
|
-
str.ensure_encoding t.properties.external_encoding, :external_encoding => t.properties.internal_encoding, :invalid_characters => :transcode
|
32
|
-
else
|
33
|
-
::Iconv.conv(t.properties.external_encoding_iconv, t.properties.internal_encoding, str.to_s + ' ')[0..-2]
|
28
|
+
if str.is_a?(::String)
|
29
|
+
[ iconv.iconv(str), iconv.iconv(nil) ].join
|
34
30
|
end
|
35
|
-
$stderr.puts "[remote_table translit] After: #{transliterated_str}" if ::ENV['REMOTE_TABLE_DEBUG'] and ::ENV['REMOTE_TABLE_DEBUG'].include?('translit')
|
36
|
-
transliterated_str
|
37
31
|
end
|
38
32
|
|
39
33
|
def assume_utf8(str)
|
40
34
|
if str.is_a?(::String) and ::RUBY_VERSION >= '1.9'
|
41
|
-
str.encode! t.
|
35
|
+
str.encode! t.config.external_encoding
|
42
36
|
else
|
43
37
|
str
|
44
38
|
end
|
45
39
|
end
|
46
40
|
|
47
|
-
|
48
|
-
|
49
|
-
|
41
|
+
private
|
42
|
+
|
43
|
+
def iconv
|
44
|
+
@iconv ||= ::Iconv.new(t.config.external_encoding_iconv, t.config.internal_encoding)
|
50
45
|
end
|
46
|
+
|
47
|
+
include ::Enumerable
|
51
48
|
end
|
52
49
|
end
|