remote_table 1.3.0 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +2 -0
- data/CHANGELOG +19 -0
- data/Gemfile +7 -1
- data/README.markdown +440 -0
- data/Rakefile +6 -14
- data/lib/remote_table.rb +27 -38
- data/lib/remote_table/{properties.rb → config.rb} +39 -43
- data/lib/remote_table/format.rb +24 -27
- data/lib/remote_table/format/delimited.rb +17 -21
- data/lib/remote_table/format/fixed_width.rb +9 -9
- data/lib/remote_table/format/html.rb +0 -2
- data/lib/remote_table/format/mixins/processed_by_nokogiri.rb +13 -12
- data/lib/remote_table/format/mixins/processed_by_roo.rb +17 -13
- data/lib/remote_table/format/mixins/textual.rb +13 -13
- data/lib/remote_table/format/open_office.rb +3 -0
- data/lib/remote_table/format/xml.rb +0 -2
- data/lib/remote_table/format/yaml.rb +14 -0
- data/lib/remote_table/local_file.rb +69 -7
- data/lib/remote_table/transformer.rb +7 -4
- data/lib/remote_table/version.rb +1 -1
- data/remote_table.gemspec +5 -13
- data/test/fixtures/data.yml +4 -0
- data/test/helper.rb +8 -9
- data/test/test_big.rb +43 -53
- data/test/test_errata.rb +27 -25
- data/test/test_old_syntax.rb +193 -191
- data/test/test_old_transform.rb +12 -10
- data/test/test_remote_table.rb +57 -47
- metadata +48 -64
- data/.document +0 -5
- data/README.rdoc +0 -167
- data/lib/remote_table/utils.rb +0 -157
data/Rakefile
CHANGED
@@ -1,25 +1,17 @@
|
|
1
|
-
|
2
|
-
|
1
|
+
#!/usr/bin/env rake
|
2
|
+
require "bundler/gem_tasks"
|
3
3
|
|
4
4
|
require 'rake'
|
5
5
|
require 'rake/testtask'
|
6
6
|
Rake::TestTask.new(:test) do |test|
|
7
|
-
test.libs << '
|
7
|
+
test.libs << 'test'
|
8
8
|
test.pattern = 'test/**/test_*.rb'
|
9
9
|
test.verbose = true
|
10
10
|
end
|
11
11
|
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
rdoc.rdoc_dir = 'rdoc'
|
16
|
-
rdoc.title = 'remote_table'
|
17
|
-
rdoc.options << '--line-numbers' << '--inline-source'
|
18
|
-
rdoc.rdoc_files.include('README*')
|
19
|
-
rdoc.rdoc_files.include('lib/**/*.rb')
|
20
|
-
end
|
21
|
-
rescue LoadError
|
22
|
-
puts "Rdoc is not available"
|
12
|
+
require 'yard'
|
13
|
+
YARD::Rake::YardocTask.new do |y|
|
14
|
+
y.options << '--no-private'
|
23
15
|
end
|
24
16
|
|
25
17
|
task :default => :test
|
data/lib/remote_table.rb
CHANGED
@@ -1,20 +1,20 @@
|
|
1
1
|
if ::RUBY_VERSION < '1.9' and $KCODE != 'UTF8'
|
2
|
-
|
2
|
+
::Kernel.warn "[remote_table] Ruby 1.8 detected, setting $KCODE to UTF8 so that ActiveSupport::Multibyte works properly."
|
3
3
|
$KCODE = 'UTF8'
|
4
4
|
end
|
5
5
|
|
6
6
|
require 'active_support'
|
7
7
|
require 'active_support/version'
|
8
|
-
|
9
|
-
active_support/core_ext
|
10
|
-
|
11
|
-
active_support/core_ext/module
|
12
|
-
active_support/core_ext/array
|
13
|
-
}.each do |active_support_3_requirement|
|
14
|
-
require active_support_3_requirement
|
15
|
-
end if ::ActiveSupport::VERSION::MAJOR == 3
|
8
|
+
if ::ActiveSupport::VERSION::MAJOR >= 3
|
9
|
+
require 'active_support/core_ext'
|
10
|
+
end
|
16
11
|
require 'hash_digest'
|
17
12
|
|
13
|
+
require 'remote_table/format'
|
14
|
+
require 'remote_table/config'
|
15
|
+
require 'remote_table/local_file'
|
16
|
+
require 'remote_table/transformer'
|
17
|
+
|
18
18
|
class Hash
|
19
19
|
attr_accessor :row_hash
|
20
20
|
end
|
@@ -23,13 +23,7 @@ class Array
|
|
23
23
|
attr_accessor :row_hash
|
24
24
|
end
|
25
25
|
|
26
|
-
class RemoteTable
|
27
|
-
autoload :Format, 'remote_table/format'
|
28
|
-
autoload :Properties, 'remote_table/properties'
|
29
|
-
autoload :LocalFile, 'remote_table/local_file'
|
30
|
-
autoload :Transformer, 'remote_table/transformer'
|
31
|
-
autoload :Utils, 'remote_table/utils'
|
32
|
-
|
26
|
+
class RemoteTable
|
33
27
|
# Legacy
|
34
28
|
class Transform
|
35
29
|
def self.row_hash(row)
|
@@ -40,7 +34,7 @@ class RemoteTable
|
|
40
34
|
include ::Enumerable
|
41
35
|
|
42
36
|
attr_reader :url
|
43
|
-
attr_reader :
|
37
|
+
attr_reader :config
|
44
38
|
|
45
39
|
# Create a new RemoteTable.
|
46
40
|
#
|
@@ -51,16 +45,16 @@ class RemoteTable
|
|
51
45
|
# Old syntax:
|
52
46
|
# RemoteTable.new(:url => 'www.customerreferenceprogram.org/uploads/CRP_RFP_template.xlsx', :foo => 'bar')
|
53
47
|
#
|
54
|
-
# See the <tt>
|
48
|
+
# See the <tt>Config</tt> object for the sorts of options you can pass.
|
55
49
|
def initialize(*args)
|
56
|
-
|
50
|
+
options = args.last.is_a?(::Hash) ? args.last.symbolize_keys : {}
|
51
|
+
|
57
52
|
@url = if args.first.is_a? ::String
|
58
53
|
args.first.dup
|
59
54
|
else
|
60
|
-
|
55
|
+
options[:url].dup
|
61
56
|
end
|
62
|
-
@
|
63
|
-
@options.freeze
|
57
|
+
@config = Config.new self, options
|
64
58
|
end
|
65
59
|
|
66
60
|
# not thread safe
|
@@ -72,17 +66,17 @@ class RemoteTable
|
|
72
66
|
retval = format.each do |row|
|
73
67
|
transformer.transform(row).each do |virtual_row|
|
74
68
|
virtual_row.row_hash = ::HashDigest.hexdigest row
|
75
|
-
if
|
76
|
-
next if
|
77
|
-
|
69
|
+
if config.errata
|
70
|
+
next if config.errata.rejects? virtual_row
|
71
|
+
config.errata.correct! virtual_row
|
78
72
|
end
|
79
|
-
next if
|
80
|
-
next if
|
81
|
-
cache.push virtual_row unless
|
73
|
+
next if config.select and !config.select.call(virtual_row)
|
74
|
+
next if config.reject and config.reject.call(virtual_row)
|
75
|
+
cache.push virtual_row unless config.streaming
|
82
76
|
yield virtual_row
|
83
77
|
end
|
84
78
|
end
|
85
|
-
fully_cached! unless
|
79
|
+
fully_cached! unless config.streaming
|
86
80
|
retval
|
87
81
|
end
|
88
82
|
end
|
@@ -117,17 +111,12 @@ class RemoteTable
|
|
117
111
|
@local_file ||= LocalFile.new self
|
118
112
|
end
|
119
113
|
|
120
|
-
# Used internally to access to the properties of the table, either set by the user or implied
|
121
|
-
def properties
|
122
|
-
@properties ||= Properties.new self
|
123
|
-
end
|
124
|
-
|
125
114
|
# Used internally to access to the driver that reads the format
|
126
115
|
def format
|
127
|
-
@format ||=
|
116
|
+
@format ||= config.format.new self
|
128
117
|
end
|
129
118
|
|
130
|
-
# Used internally to
|
119
|
+
# Used internally to access the transformer (aka parser).
|
131
120
|
def transformer
|
132
121
|
@transformer ||= Transformer.new self
|
133
122
|
end
|
@@ -139,8 +128,8 @@ class RemoteTable
|
|
139
128
|
def mark_download!
|
140
129
|
@download_count ||= 0
|
141
130
|
@download_count += 1
|
142
|
-
if
|
143
|
-
|
131
|
+
if config.warn_on_multiple_downloads and download_count > 1
|
132
|
+
::Kernel.warn "[remote_table] #{url} has been downloaded #{download_count} times."
|
144
133
|
end
|
145
134
|
end
|
146
135
|
|
@@ -1,19 +1,15 @@
|
|
1
1
|
require 'uri'
|
2
2
|
class RemoteTable
|
3
|
-
# Represents the
|
4
|
-
class
|
3
|
+
# Represents the config of a RemoteTable, whether they are explicitly set by the user or inferred automatically.
|
4
|
+
class Config
|
5
5
|
attr_reader :t
|
6
|
-
attr_reader :
|
6
|
+
attr_reader :user_specified_options
|
7
7
|
|
8
|
-
def initialize(t)
|
8
|
+
def initialize(t, user_specified_options)
|
9
9
|
@t = t
|
10
|
-
@
|
10
|
+
@user_specified_options = user_specified_options
|
11
11
|
end
|
12
|
-
|
13
|
-
def update(options)
|
14
|
-
current_options.update options
|
15
|
-
end
|
16
|
-
|
12
|
+
|
17
13
|
# The parsed URI of the file to get.
|
18
14
|
def uri
|
19
15
|
return @uri if @uri.is_a?(::URI)
|
@@ -29,19 +25,19 @@ class RemoteTable
|
|
29
25
|
# * call each
|
30
26
|
# Defaults to false.
|
31
27
|
def streaming
|
32
|
-
|
28
|
+
user_specified_options.fetch :streaming, false
|
33
29
|
end
|
34
30
|
|
35
31
|
# Defaults to true.
|
36
32
|
def warn_on_multiple_downloads
|
37
|
-
|
33
|
+
user_specified_options[:warn_on_multiple_downloads] != false
|
38
34
|
end
|
39
35
|
|
40
36
|
# The headers specified by the user
|
41
37
|
#
|
42
38
|
# Default: :first_row
|
43
39
|
def headers
|
44
|
-
|
40
|
+
user_specified_options[:headers].nil? ? :first_row : user_specified_options[:headers]
|
45
41
|
end
|
46
42
|
|
47
43
|
def use_first_row_as_header?
|
@@ -53,33 +49,31 @@ class RemoteTable
|
|
53
49
|
end
|
54
50
|
|
55
51
|
# The sheet specified by the user as a number or a string
|
56
|
-
#
|
57
|
-
# Default: 0
|
58
52
|
def sheet
|
59
|
-
|
53
|
+
user_specified_options[:sheet]
|
60
54
|
end
|
61
55
|
|
62
56
|
# Whether to keep blank rows
|
63
57
|
#
|
64
58
|
# Default: false
|
65
59
|
def keep_blank_rows
|
66
|
-
|
60
|
+
user_specified_options.fetch :keep_blank_rows, false
|
67
61
|
end
|
68
62
|
|
69
63
|
# Form data to send in with the download request
|
70
64
|
def form_data
|
71
|
-
|
65
|
+
user_specified_options[:form_data]
|
72
66
|
end
|
73
67
|
|
74
68
|
# How many rows to skip
|
75
69
|
#
|
76
70
|
# Default: 0
|
77
71
|
def skip
|
78
|
-
|
72
|
+
user_specified_options.fetch :skip, 0
|
79
73
|
end
|
80
74
|
|
81
75
|
def internal_encoding
|
82
|
-
(
|
76
|
+
user_specified_options.fetch(:encoding, 'UTF-8').upcase
|
83
77
|
end
|
84
78
|
|
85
79
|
def external_encoding
|
@@ -94,27 +88,27 @@ class RemoteTable
|
|
94
88
|
#
|
95
89
|
# Default: ","
|
96
90
|
def delimiter
|
97
|
-
|
91
|
+
user_specified_options.fetch :delimiter, ','
|
98
92
|
end
|
99
93
|
|
100
94
|
# The XPath used to find rows
|
101
95
|
def row_xpath
|
102
|
-
|
96
|
+
user_specified_options[:row_xpath]
|
103
97
|
end
|
104
98
|
|
105
99
|
# The XPath used to find columns
|
106
100
|
def column_xpath
|
107
|
-
|
101
|
+
user_specified_options[:column_xpath]
|
108
102
|
end
|
109
103
|
|
110
104
|
# The CSS selector used to find rows
|
111
105
|
def row_css
|
112
|
-
|
106
|
+
user_specified_options[:row_css]
|
113
107
|
end
|
114
108
|
|
115
109
|
# The CSS selector used to find columns
|
116
110
|
def column_css
|
117
|
-
|
111
|
+
user_specified_options[:column_css]
|
118
112
|
end
|
119
113
|
|
120
114
|
# The compression type.
|
@@ -123,8 +117,8 @@ class RemoteTable
|
|
123
117
|
#
|
124
118
|
# Can be specified as: :gz, :zip, :bz2, :exe (treated as :zip)
|
125
119
|
def compression
|
126
|
-
if
|
127
|
-
return
|
120
|
+
if user_specified_options.has_key?(:compression)
|
121
|
+
return user_specified_options[:compression]
|
128
122
|
end
|
129
123
|
case ::File.extname(uri.path).downcase
|
130
124
|
when /gz/, /gunzip/
|
@@ -144,8 +138,8 @@ class RemoteTable
|
|
144
138
|
#
|
145
139
|
# Can be specified as: :tar
|
146
140
|
def packing
|
147
|
-
if
|
148
|
-
return
|
141
|
+
if user_specified_options.has_key?(:packing)
|
142
|
+
return user_specified_options[:packing]
|
149
143
|
end
|
150
144
|
if uri.path =~ %r{\.tar(?:\.|$)}i
|
151
145
|
:tar
|
@@ -157,7 +151,7 @@ class RemoteTable
|
|
157
151
|
# Example:
|
158
152
|
# RemoteTable.new 'http://www.fueleconomy.gov/FEG/epadata/08data.zip', :glob => '/*.csv'
|
159
153
|
def glob
|
160
|
-
|
154
|
+
user_specified_options[:glob]
|
161
155
|
end
|
162
156
|
|
163
157
|
# The filename, which can be used to pick a file out of an archive.
|
@@ -165,17 +159,17 @@ class RemoteTable
|
|
165
159
|
# Example:
|
166
160
|
# RemoteTable.new 'http://www.fueleconomy.gov/FEG/epadata/08data.zip', :filename => '2008_FE_guide_ALL_rel_dates_-no sales-for DOE-5-1-08.csv'
|
167
161
|
def filename
|
168
|
-
|
162
|
+
user_specified_options[:filename]
|
169
163
|
end
|
170
164
|
|
171
165
|
# Cut columns up to this character
|
172
166
|
def cut
|
173
|
-
|
167
|
+
user_specified_options[:cut]
|
174
168
|
end
|
175
169
|
|
176
170
|
# Crop rows after this line
|
177
171
|
def crop
|
178
|
-
|
172
|
+
user_specified_options[:crop]
|
179
173
|
end
|
180
174
|
|
181
175
|
# The fixed-width schema, given as an array
|
@@ -190,31 +184,31 @@ class RemoteTable
|
|
190
184
|
# [ 'spacer', 12 ],
|
191
185
|
# [ 'header6', 10, { :type => :string } ]])
|
192
186
|
def schema
|
193
|
-
|
187
|
+
user_specified_options[:schema]
|
194
188
|
end
|
195
189
|
|
196
190
|
# The name of the fixed-width schema according to FixedWidth
|
197
191
|
def schema_name
|
198
|
-
|
192
|
+
user_specified_options[:schema_name]
|
199
193
|
end
|
200
194
|
|
201
195
|
# A proc to call to decide whether to return a row.
|
202
196
|
def select
|
203
|
-
|
197
|
+
user_specified_options[:select]
|
204
198
|
end
|
205
199
|
|
206
200
|
# A proc to call to decide whether to return a row.
|
207
201
|
def reject
|
208
|
-
|
202
|
+
user_specified_options[:reject]
|
209
203
|
end
|
210
204
|
|
211
205
|
# A hash of options to create a new Errata instance (see the Errata gem at http://github.com/seamusabshere/errata) to be used on every row.
|
212
206
|
def errata
|
213
|
-
return unless
|
214
|
-
@errata ||= if
|
215
|
-
::Errata.new
|
207
|
+
return unless user_specified_options.has_key? :errata
|
208
|
+
@errata ||= if user_specified_options[:errata].is_a? ::Hash
|
209
|
+
::Errata.new user_specified_options[:errata]
|
216
210
|
else
|
217
|
-
|
211
|
+
user_specified_options[:errata]
|
218
212
|
end
|
219
213
|
end
|
220
214
|
|
@@ -227,8 +221,8 @@ class RemoteTable
|
|
227
221
|
# Can be specified as: :xlsx, :xls, :delimited (aka :csv and :tsv), :ods, :fixed_width, :html
|
228
222
|
def format
|
229
223
|
return Format::Delimited if uri.host == 'spreadsheets.google.com' or @uri.host == 'docs.google.com'
|
230
|
-
clue = if
|
231
|
-
|
224
|
+
clue = if user_specified_options.has_key?(:format)
|
225
|
+
user_specified_options[:format]
|
232
226
|
else
|
233
227
|
t.local_file.path
|
234
228
|
end
|
@@ -247,6 +241,8 @@ class RemoteTable
|
|
247
241
|
Format::HTML
|
248
242
|
when /xml/
|
249
243
|
Format::XML
|
244
|
+
when /yaml/, /yml/
|
245
|
+
Format::Yaml
|
250
246
|
else
|
251
247
|
Format::Delimited
|
252
248
|
end
|
data/lib/remote_table/format.rb
CHANGED
@@ -1,22 +1,22 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
1
|
+
require 'iconv'
|
2
|
+
if RUBY_VERSION >= '1.9'
|
3
|
+
# for an excellent explanation see http://blog.segment7.net/2010/12/17/from-iconv-iconv-to-string-encode
|
4
|
+
Kernel.warn "[remote_table] Apologies - using iconv because Ruby 1.9.x's String#encode doesn't have transliteration tables (yet)"
|
5
5
|
end
|
6
6
|
|
7
|
+
require 'remote_table/format/mixins/textual'
|
8
|
+
require 'remote_table/format/mixins/processed_by_roo'
|
9
|
+
require 'remote_table/format/mixins/processed_by_nokogiri'
|
10
|
+
require 'remote_table/format/excel'
|
11
|
+
require 'remote_table/format/excelx'
|
12
|
+
require 'remote_table/format/delimited'
|
13
|
+
require 'remote_table/format/open_office'
|
14
|
+
require 'remote_table/format/fixed_width'
|
15
|
+
require 'remote_table/format/html'
|
16
|
+
require 'remote_table/format/xml'
|
17
|
+
require 'remote_table/format/yaml'
|
7
18
|
class RemoteTable
|
8
19
|
class Format
|
9
|
-
autoload :Excel, 'remote_table/format/excel'
|
10
|
-
autoload :Excelx, 'remote_table/format/excelx'
|
11
|
-
autoload :Delimited, 'remote_table/format/delimited'
|
12
|
-
autoload :OpenOffice, 'remote_table/format/open_office'
|
13
|
-
autoload :FixedWidth, 'remote_table/format/fixed_width'
|
14
|
-
autoload :HTML, 'remote_table/format/html'
|
15
|
-
autoload :XML, 'remote_table/format/xml'
|
16
|
-
|
17
|
-
autoload :Textual, 'remote_table/format/mixins/textual'
|
18
|
-
autoload :ProcessedByRoo, 'remote_table/format/mixins/processed_by_roo'
|
19
|
-
autoload :ProcessedByNokogiri, 'remote_table/format/mixins/processed_by_nokogiri'
|
20
20
|
|
21
21
|
attr_reader :t
|
22
22
|
|
@@ -25,28 +25,25 @@ class RemoteTable
|
|
25
25
|
end
|
26
26
|
|
27
27
|
def transliterate_to_utf8(str)
|
28
|
-
|
29
|
-
|
30
|
-
transliterated_str = if ::RUBY_VERSION >= '1.9'
|
31
|
-
str.ensure_encoding t.properties.external_encoding, :external_encoding => t.properties.internal_encoding, :invalid_characters => :transcode
|
32
|
-
else
|
33
|
-
::Iconv.conv(t.properties.external_encoding_iconv, t.properties.internal_encoding, str.to_s + ' ')[0..-2]
|
28
|
+
if str.is_a?(::String)
|
29
|
+
[ iconv.iconv(str), iconv.iconv(nil) ].join
|
34
30
|
end
|
35
|
-
$stderr.puts "[remote_table translit] After: #{transliterated_str}" if ::ENV['REMOTE_TABLE_DEBUG'] and ::ENV['REMOTE_TABLE_DEBUG'].include?('translit')
|
36
|
-
transliterated_str
|
37
31
|
end
|
38
32
|
|
39
33
|
def assume_utf8(str)
|
40
34
|
if str.is_a?(::String) and ::RUBY_VERSION >= '1.9'
|
41
|
-
str.encode! t.
|
35
|
+
str.encode! t.config.external_encoding
|
42
36
|
else
|
43
37
|
str
|
44
38
|
end
|
45
39
|
end
|
46
40
|
|
47
|
-
|
48
|
-
|
49
|
-
|
41
|
+
private
|
42
|
+
|
43
|
+
def iconv
|
44
|
+
@iconv ||= ::Iconv.new(t.config.external_encoding_iconv, t.config.internal_encoding)
|
50
45
|
end
|
46
|
+
|
47
|
+
include ::Enumerable
|
51
48
|
end
|
52
49
|
end
|