scraperwiki 1.0.1 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/scraperwiki.rb +75 -431
- metadata +24 -77
- data/lib/scraperwiki/datastore.rb +0 -109
- data/lib/scraperwiki/stacktrace.rb +0 -51
- data/lib/version.rb +0 -4
data/lib/scraperwiki.rb
CHANGED

@@ -1,480 +1,124 @@
-require '
-
-require
-require 'scraperwiki/datastore'
-require 'httpclient'
-
-class SqliteException < RuntimeError
-end
-
-class NoSuchTableSqliteException < SqliteException
-end
-
-$apiwrapperattacheddata = [ ]
+require 'sqlite3'
+$LOAD_PATH.unshift(File.expand_path(File.dirname(__FILE__)))
+require 'scraperwiki/sqlite_save_info.rb'

 module ScraperWiki

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+# The scrape method fetches the content from a webserver.
+#
+# === Parameters
+#
+# * _url_ = The URL to fetch
+# * _params_ = The parameters to send with a POST request
+# * _agent = A manually supplied useragent string
+#
+# === Example
+# ScraperWiki::scrape('http://scraperwiki.com')
+#
+def ScraperWiki.scrape(url, params = nil, agent = nil)
+if agent
+client = HTTPClient.new(:agent_name => agent)
+else
+client = HTTPClient.new
+end
 client.ssl_config.verify_mode = OpenSSL::SSL::VERIFY_NONE
+if HTTPClient.respond_to?("client.transparent_gzip_decompression=")
+client.transparent_gzip_decompression = true
+end

-if params.nil?
-
+if params.nil?
+html = client.get_content(url)
 else
-
+html = client.post_content(url, params)
 end
-end

-
-
-
-
-
-
+unless HTTPClient.respond_to?("client.transparent_gzip_decompression=")
+begin
+gz = Zlib::GzipReader.new(StringIO.new(html))
+return gz.read
+rescue
+return html
 end
-
+end
 end

+# Saves the provided data into a local database for this scraper. Data is upserted
+# into this table (inserted if it does not exist, updated if the unique keys say it
+# does).
+#
+# === Parameters
+#
+# * _unique_keys_ = A list of column names, that used together should be unique
+# * _data_ = A hash of the data where the Key is the column name, the Value the row
+# value. If sending lots of data this can be a list of hashes.
+# * _table_name_ = The name that the newly created table should use.
+#
+# === Example
+# ScraperWiki::save(['id'], {'id'=>1})
+#
+def ScraperWiki.save_sqlite(unique_keys, data, table_name="swdata")
+raise 'unique_keys must be nil or an array' if unique_keys != nil && !unique_keys.kind_of?(Array)
+raise 'data must have a non-nil value' if data == nil

+# convert :symbols to "strings"
+unique_keys = unique_keys.map { |x| x.kind_of?(Symbol) ? x.to_s : x }

-
-
-
-
-
-def ScraperWiki.save(unique_keys, data, date=nil, latlng=nil, table_name="swdata")
-if unique_keys != nil && !unique_keys.kind_of?(Array)
-raise 'unique_keys must be nil or an array'
-end
-if data == nil
-raise 'data must have a non-nil value'
-end
-
-ds = SW_DataStore.create()
-ldata = data.dup
-if date != nil
-ldata["date"] = date
-end
-if latlng != nil
-ldata["latlng_lat"] = latlng[0]
-ldata["latlng_lng"] = latlng[1]
+if data.class == Hash
+data = [ data ]
+elsif data.length == 0
+return
 end
-return ScraperWiki.save_sqlite(unique_keys, ldata, table_name="swdata", verbose=2)
-end
-

-
-
-
-
-else
-username = 'resourcedir' # gets it into the right subdirectory automatically!!!
-dirscrapername = ds.m_scrapername
-if ds.m_scrapername == '' or ds.m_scrapername.nil?
-dirscrapername = 'DRAFT__' + ds.m_runid.gsub(/[\.\-]/, '_')
-end
-path = "%s/%s" % [username, dirscrapername]
-
-record = {"query"=>sqlquery, "params"=>data, "attach"=>[]}
-$attachlist.each do |value|
-record["attach"].push({"user"=>username, "database"=>value["name"], "alias"=>value["asattach"], "securityhash"=>"somthing"})
-end
-
-httpcall = Net::HTTP.new(ds.m_host, ds.m_webstore_port)
-headers = { "Accept"=>"application/json+tuples", "X-Scrapername"=>ds.m_scrapername, "X-Runid"=>ds.m_runid, "Content-Type"=>"application/json" }
-response = httpcall.put(path, JSON.generate(record), headers)
-res = JSON.parse(response.body)
-if res["state"] == "error"
-ScraperWiki.raisesqliteerror(res["message"])
-end
-if (res.class == Hash) and (res["keys"].class == Array) and (res["data"].class == Array)
-if res["keys"].include?("state") and (res["data"].length == 1)
-ddata = Hash[*res["keys"].zip(res["data"][0]).flatten]
-if ddata["state"] == "error"
-ScraperWiki.raisesqliteerror(ddata["message"])
-end
-end
-end
-end
+rjdata = [ ]
+for ldata in data
+ljdata = _convdata(unique_keys, ldata)
+rjdata.push(ljdata)

-if verbose
-if data.kind_of?(Array)
-data.each do |value|
-ldata = [ ]
-if value == nil
-value = ''
-end
-ldata.push(ScraperWiki._unicode_truncate(value.to_s, 50))
-end
-else
-ldata = data
-end
-ScraperWiki.dumpMessage({'message_type'=>'sqlitecall', 'command'=>"execute", 'val1'=>sqlquery, 'val2'=>ldata})
 end
-return res
-end
-

+SQLiteMagic._do_save_sqlite(unique_keys, rjdata, table_name)
+end

-
+# Internal function to check a row of data, convert to right format
 def ScraperWiki._convdata(unique_keys, scraper_data)
 if unique_keys
 for key in unique_keys
 if !key.kind_of?(String) and !key.kind_of?(Symbol)
-return
+return 'unique_keys must each be a string or a symbol, this one is not: ' + key
 end
 if !scraper_data.include?(key) and !scraper_data.include?(key.to_sym)
-return
+return 'unique_keys must be a subset of data, this one is not: ' + key
 end
 if scraper_data[key] == nil and scraper_data[key.to_sym] == nil
-return
+return 'unique_key value should not be nil, this one is nil: ' + key
 end
 end
 end

 jdata = { }
 scraper_data.each_pair do |key, value|
-if not key
-return { "error" => 'key must not be blank', "bad_key" => key }
-end
-if key.kind_of?(Symbol)
-key = key.to_s
-end
-if key.class != String
-return { "error" => 'key must be string type', "bad_key" => key }
-end
+raise 'key must not have blank name' if not key

-if
-
-
+key = key.to_s if key.kind_of?(Symbol)
+raise 'key must be string or symbol type: ' + key if key.class != String
+raise 'key must be simple text: ' + key if !/[a-zA-Z0-9_\- ]+$/.match(key)

-
+# convert formats
+if value.kind_of?(Date)
 value = value.iso8601
 end
 if value.kind_of?(Time)
 value = value.iso8601
-raise "internal error, timezone came out as non-UTC while converting to SQLite format" unless value.match(
-value.gsub!(
+raise "internal error, timezone came out as non-UTC while converting to SQLite format" unless value.match(/([+-]00:00|Z)$/)
+value.gsub!(/([+-]00:00|Z)$/, '')
 end
 if ![Fixnum, Float, String, TrueClass, FalseClass, NilClass].include?(value.class)
 value = value.to_s
 end
+
 jdata[key] = value
 end
 return jdata
 end

-
-def ScraperWiki.save_sqlite(unique_keys, data, table_name="swdata", verbose=2)
-if !data
-ScraperWiki.dumpMessage({'message_type' => 'data', 'content' => "EMPTY SAVE IGNORED"})
-return
-end
-
-# convert :symbols to "strings"
-unique_keys = unique_keys.map { |x| x.kind_of?(Symbol) ? x.to_s : x }
-
-if data.class == Hash
-data = [ data ]
-end
-
-rjdata = [ ]
-for ldata in data
-ljdata = _convdata(unique_keys, ldata)
-if ljdata.include?("error")
-raise SqliteException.new(ljdata["error"])
-end
-rjdata.push(ljdata)
-end
-
-ds = SW_DataStore.create()
-if ds.m_webstore_port == 0
-res = ds.request({'maincommand'=>'save_sqlite', 'unique_keys'=>unique_keys, 'data'=>rjdata, 'swdatatblname'=>table_name})
-else
-username = 'resourcedir' # gets it into the right subdirectory automatically!!!
-dirscrapername = ds.m_scrapername
-if ds.m_scrapername == '' or ds.m_scrapername.nil?
-dirscrapername = 'DRAFT__' + ds.m_runid.gsub(/[\.\-]/, '_')
-end
-
-# (do something about jargtypes later)
-qsl = [ ]
-unique_keys.each do |key|
-qsl.push("unique="+URI.encode(key))
-end
-
-# quick and dirty provision of column types to the webstore
-if rjdata.length != 0
-jargtypes = { }
-rjdata[0].each_pair do |k, v|
-if v != nil
-#if k[-5..-1] == "_blob"
-# vt = "blob" # coerced into affinity none
-if v.class == Fixnum
-vt = "integer"
-elsif v.class == Float
-vt = "real"
-else
-vt = "text"
-end
-jargtypes[k] = vt
-end
-end
-qsl.push(("jargtypes="+JSON.generate(jargtypes)))
-end
-
-path = "%s/%s/%s?%s" % [username, dirscrapername, table_name, qsl.join("&")]
-#puts JSON.generate(rjdata)
-httpcall = Net::HTTP.new(ds.m_host, ds.m_webstore_port)
-headers = { "Accept"=>"application/json", "X-Scrapername"=>ds.m_scrapername, "X-Runid"=>ds.m_runid, "Content-Type"=>"application/json" }
-response = httpcall.post(path, JSON.generate(rjdata), headers)
-#puts response.body
-res = JSON.parse(response.body)
-if res["state"] == "error"
-res["error"] = res["message"]
-end
-end
-
-
-if res["error"]
-raise SqliteException.new(res["error"])
-end
-
-if verbose >= 2
-pdata = { }
-if rjdata.class == Hash
-sdata = rjdata
-else
-sdata = rjdata[0]
-end
-sdata.each_pair do |key, value|
-key = ScraperWiki._unicode_truncate(key.to_s, 50)
-if value == nil
-value = ''
-else
-value = ScraperWiki._unicode_truncate(value.to_s, 50)
-end
-pdata[key] = String(value)
-end
-if rjdata.class == Array and rjdata.size > 1
-pdata["number_records"] = "Number Records: "+String(rjdata.size)
-end
-ScraperWiki.dumpMessage({'message_type' => 'data', 'content' => pdata})
-end
-return res
-end
-
-# also needs to handle the types better (could save json and datetime objects handily
-def ScraperWiki.save_var(name, value, verbose=2)
-vtype = String(value.class)
-svalue = value.to_s
-if vtype != "Fixnum" and vtype != "String" and vtype != "Float" and vtype != "NilClass"
-puts "*** object of type "+vtype+" converted to string\n"
-end
-data = { "name" => name, "value_blob" => svalue, "type" => vtype }
-ScraperWiki.save_sqlite(unique_keys=["name"], data=data, table_name="swvariables", verbose=verbose)
-end
-
-def ScraperWiki.get_var(name, default=nil, verbose=2)
-begin
-result = ScraperWiki.sqliteexecute("select value_blob, type from swvariables where name=?", [name], verbose)
-rescue NoSuchTableSqliteException => e
-return default
-end
-
-if !result.has_key?("data")
-return default
-end
-
-if result["data"].length == 0
-return default
-end
-# consider casting to type
-svalue = result["data"][0][0]
-vtype = result["data"][0][1]
-if vtype == "Fixnum"
-return svalue.to_i
-end
-if vtype == "Float"
-return svalue.to_f
-end
-if vtype == "NilClass"
-return nil
-end
-return svalue
-end
-
-# These are DEPRECATED and just here for compatibility
-def ScraperWiki.get_metadata(metadata_name, default = nil)
-if !$metadatamessagedone == nil
-puts "*** instead of get_metadata('"+metadata_name+"') please use\n get_var('"+metadata_name+"')"
-metadatamessagedone = true
-end
-result = ScraperWiki.get_var(metadata_name, default)
-return result
-end
-
-# These are DEPRECATED and just here for compatibility
-def ScraperWiki.save_metadata(metadata_name, value)
-if !$metadatamessagedone
-puts "*** instead of save_metadata('"+metadata_name+"') please use\n save_var('"+metadata_name+"')"
-$metadatamessagedone = true
-end
-return ScraperWiki.save_var(metadata_name, value)
-end
-
-
-def ScraperWiki.show_tables(dbname=nil)
-name = "sqlite_master"
-if dbname != nil
-name = "`"+dbname+"`.sqlite_master"
-end
-result = ScraperWiki.sqliteexecute("select tbl_name, sql from "+name+" where type='table'")
-#return result["data"]
-return (Hash[*result["data"].flatten]) # pre-1.8.7
-end
-
-
-def ScraperWiki.table_info(name)
-sname = name.split(".")
-if sname.length == 2
-result = ScraperWiki.sqliteexecute("PRAGMA %s.table_info(`%s`)" % sname)
-else
-result = ScraperWiki.sqliteexecute("PRAGMA table_info(`%s`)" % name)
-end
-res = [ ]
-for d in result["data"]
-res.push(Hash[*result["keys"].zip(d).flatten]) # pre-1.8.7
-end
-return res
-end
-
-
-def ScraperWiki.getDataByDate(name, start_date, end_date, limit=-1, offset=0)
-raise SqliteException.new("getDataByDate has been deprecated")
-end
-
-def ScraperWiki.getDataByLocation(name, lat, lng, limit=-1, offset=0)
-raise SqliteException.new("getDataByLocation has been deprecated")
-end
-
-def ScraperWiki.search(name, filterdict, limit=-1, offset=0)
-raise SqliteException.new("SW_APIWrapper.search has been deprecated")
-end
-
-def ScraperWiki.raisesqliteerror(rerror)
-if /sqlite3.Error: no such table:/.match(rerror) # old dataproxy
-raise NoSuchTableSqliteException.new(rerror)
-end
-if /DB Error: \(OperationalError\) no such table:/.match(rerror)
-raise NoSuchTableSqliteException.new(rerror)
-end
-raise SqliteException.new(rerror)
-end
-
-def ScraperWiki.attach(name, asname=nil, verbose=1)
-$attachlist.push({"name"=>name, "asname"=>asname})
-
-ds = SW_DataStore.create()
-
-if ds.m_webstore_port == 0
-res = ds.request({'maincommand'=>'sqlitecommand', 'command'=>"attach", 'name'=>name, 'asname'=>asname})
-if res["error"]
-ScraperWiki.raisesqliteerror(res)
-end
-else
-res = {'status'=>'ok'}
-end
-
-if verbose
-ScraperWiki.dumpMessage({'message_type'=>'sqlitecall', 'command'=>"attach", 'val1'=>name, 'val2'=>asname})
-end
-
-return res
-end
-
-
-def ScraperWiki.commit(verbose=1)
-ds = SW_DataStore.create()
-if ds.m_webstore_port == 0
-res = ds.request({'maincommand'=>'sqlitecommand', 'command'=>"commit"})
-else
-puts "*** commit() no longer a necessary function call"
-res = {'status'=>'ok'}
-end
-end
-
-def ScraperWiki.select(sqlquery, data=nil, verbose=1)
-if data != nil && sqlquery.scan(/\?/).length != 0 && data.class != Array
-data = [data]
-end
-result = ScraperWiki.sqliteexecute("select "+sqlquery, data, verbose)
-res = [ ]
-for d in result["data"]
-#res.push(Hash[result["keys"].zip(d)]) # post-1.8.7
-res.push(Hash[*result["keys"].zip(d).flatten]) # pre-1.8.7
-end
-return res
-end
-
-# old functions put back in for regression
-def ScraperWiki.getData(name, limit=-1, offset=0)
-if !$apiwrapperattacheddata.include?(name)
-puts "*** instead of getData('"+name+"') please use\n ScraperWiki.attach('"+name+"') \n print ScraperWiki.select('* from `"+name+"`.swdata')"
-ScraperWiki.attach(name)
-$apiwrapperattacheddata.push(name)
-end
-
-apilimit = 500
-g = Enumerator.new do |g|
-count = 0
-while true
-if limit == -1
-step = apilimit
-else
-step = apilimit < (limit - count) ? apilimit : limit - count
-end
-query = "* from `#{name}`.swdata limit #{step} offset #{offset+count}"
-
-records = ScraperWiki.select(query)
-for r in records
-g.yield r
-end
-
-count += records.length
-if records.length < step
-break
-end
-if limit != -1 and count >= limit
-break
-end
-end
-end
-end
-
-def ScraperWiki.getKeys(name)
-if !$apiwrapperattacheddata.include?(name)
-puts "*** instead of getKeys('"+name+"') please use\n ScraperWiki.attach('"+name+"') \n print ScraperWiki.sqliteexecute('select * from `"+name+"`.swdata limit 0')['keys']"
-ScraperWiki.attach(name)
-$apiwrapperattacheddata.push(name)
-end
-result = ScraperWiki.sqliteexecute("select * from `"+name+"`.swdata limit 0")
-if result.include?("error")
-raise SqliteException.new(result["error"])
-end
-return result["keys"]
-end
 end
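For orientation, a minimal usage sketch of the 2.0.0 API shown in the diff above; the URL and row values are illustrative (taken from the gem's own inline examples), and it assumes the gem is installed so that require 'scraperwiki' loads this file:

require 'scraperwiki'

# Fetch a page; ScraperWiki.scrape wraps HTTPClient and falls back to
# Zlib::GzipReader if the response body turns out to be gzip-compressed.
html = ScraperWiki.scrape('http://scraperwiki.com')

# Upsert a row into the default local SQLite table "swdata";
# ['id'] is the unique-key list, as in the inline example ScraperWiki::save(['id'], {'id'=>1}).
ScraperWiki.save_sqlite(['id'], { 'id' => 1, 'page_length' => html.length })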
metadata
CHANGED

@@ -1,98 +1,45 @@
---- !ruby/object:Gem::Specification
+--- !ruby/object:Gem::Specification
 name: scraperwiki
-version: !ruby/object:Gem::Version
-
+version: !ruby/object:Gem::Version
+version: 2.0.0
 prerelease:
-segments:
-- 1
-- 0
-- 1
-version: 1.0.1
 platform: ruby
-authors:
-- Francis
+authors:
+- Francis irving
 autorequire:
 bindir: bin
 cert_chain: []
-
-
-
-
-- !ruby/object:Gem::Dependency
-name: json
-prerelease: false
-requirement: &id001 !ruby/object:Gem::Requirement
-none: false
-requirements:
-- - ">="
-- !ruby/object:Gem::Version
-hash: 3
-segments:
-- 0
-version: "0"
-type: :runtime
-version_requirements: *id001
-- !ruby/object:Gem::Dependency
-name: httpclient
-prerelease: false
-requirement: &id002 !ruby/object:Gem::Requirement
-none: false
-requirements:
-- - ">="
-- !ruby/object:Gem::Version
-hash: 3
-segments:
-- 0
-version: "0"
-type: :runtime
-version_requirements: *id002
-description: Ruby code used for accessing
-email:
-- francis@scraperwiki.com
+date: 2013-04-04 00:00:00.000000000 Z
+dependencies: []
+description: A library for scraping web pages and saving data easily
+email: francis@scraperwiki.com
 executables: []
-
 extensions: []
-
 extra_rdoc_files: []
-
-files:
-- lib/version.rb
+files:
 - lib/scraperwiki.rb
-
-- lib/scraperwiki/stacktrace.rb
-has_rdoc: true
-homepage: http://scraperwiki.com
+homepage: http://rubygems.org/gems/scraperwiki
 licenses: []
-
 post_install_message:
 rdoc_options: []
-
-require_paths:
+require_paths:
 - lib
-required_ruby_version: !ruby/object:Gem::Requirement
+required_ruby_version: !ruby/object:Gem::Requirement
 none: false
-requirements:
-- -
-- !ruby/object:Gem::Version
-
-
-- 0
-version: "0"
-required_rubygems_version: !ruby/object:Gem::Requirement
+requirements:
+- - ! '>='
+- !ruby/object:Gem::Version
+version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
 none: false
-requirements:
-- -
-- !ruby/object:Gem::Version
-
-segments:
-- 0
-version: "0"
+requirements:
+- - ! '>='
+- !ruby/object:Gem::Version
+version: '0'
 requirements: []
-
 rubyforge_project:
-rubygems_version: 1.
+rubygems_version: 1.8.23
 signing_key:
 specification_version: 3
-summary: ScraperWiki
+summary: ScraperWiki
 test_files: []
-
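A minimal sketch of pulling this release into a scraper project; only the gem name and version come from the metadata above, and the Bundler Gemfile itself is illustrative:

# Gemfile (illustrative)
source 'https://rubygems.org'
gem 'scraperwiki', '2.0.0'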
data/lib/scraperwiki/datastore.rb
DELETED

@@ -1,109 +0,0 @@
-require 'json'
-require 'singleton'
-require 'thread'
-require 'cgi'
-
-# the python version of this makes use of a global static copy of the class
-# so the connection is made only once to the dataproxy
-# I think the Singleton module implements this magically
-
-class SW_DataStore
-
-@@lock = Mutex.new
-
-include Singleton
-
-attr_accessor :m_port, :m_host, :m_scrapername, :m_runid, :m_attachables, :m_webstore_port
-
-def initialize
-@m_socket = nil
-@m_host = nil
-@m_port = nil
-@m_scrapername = ''
-@m_runid = ''
-@m_attachables = []
-@webstore_port = 0
-end
-
-
-def ensure_connected
-# Connect to the data proxy. The data proxy will need to make an Ident call
-# back to get the scraperID. Since the data proxy may be on another machine
-# and the peer address it sees will have been subject to NAT or masquerading,
-# send the UML name and the socket port number in the request.
-
-if @m_socket == nil
-@m_socket = TCPSocket.open(@m_host, @m_port)
-proto, port, name, ip = @m_socket.addr()
-if @m_scrapername == '' or @m_scrapername.nil?
-sname = ''
-else
-sname = CGI::escape(@m_scrapername)
-end
-if @m_runid == '' or @m_runid.nil?
-rid = ''
-else
-rid = CGI::escape(@m_runid)
-end
-
-getmsg = "GET /?uml=%s&port=%s&vscrapername=%s&vrunid=%s HTTP/1.1\n\n" % ['lxc', port, sname, rid]
-@m_socket.send(getmsg, 0)
-@m_socket.flush()
-
-buffer = @m_socket.recv(1024)
-result = JSON.parse(buffer)
-if result["status"] != "good"
-raise result["status"]
-end
-end
-end
-
-def request (req)
-text = ''
-@@lock.synchronize {
-ensure_connected
-reqmsg = JSON.generate(req) + "\n"
-
-bytes_sent = 0
-while bytes_sent < reqmsg.length
-bytes_sent += @m_socket.send(reqmsg.slice(bytes_sent, reqmsg.length), 0)
-end
-@m_socket.flush()
-
-while true
-buffer = @m_socket.recv(1024)
-if buffer.length == 0
-break
-end
-text += buffer
-if text[-1] == "\n"[0]
-break
-end
-end
-}
-return JSON.parse(text)
-end
-
-# function used to both initialize the settings and get an instance!
-# this is ridiculous and unnecessary with new webstore.
-# we are creating object without the fields merely to access the static variables!
-def SW_DataStore.create(host=nil, port = nil, scrapername = '', runid = nil, attachables = nil, webstore_port = nil)
-instance = SW_DataStore.instance
-# so, it might be intended that the host and port are
-# set once, never to be changed, but this is ruby so
-# there's no way to guarantee that.
-if host && port && instance.m_port.nil? && instance.m_host.nil?
-instance.m_host = host
-instance.m_port = port
-instance.m_scrapername = scrapername
-instance.m_runid = runid
-instance.m_attachables = attachables
-instance.m_webstore_port = webstore_port
-elsif host && port
-raise "Can't change host and port once connection made"
-elsif !(instance.m_port) || !(instance.m_host)
-raise "Can't return a datastore without port/host information"
-end
-instance
-end
-end

data/lib/scraperwiki/stacktrace.rb
DELETED

@@ -1,51 +0,0 @@
-def _get_stackentry(code_filename, code, filename, linenumber, funcname)
-nlinenumber = linenumber.to_i
-stackentry = {"file" => filename, "linenumber" => nlinenumber, "duplicates" => 1}
-
-if filename == "(eval)" or filename == code_filename
-codelines = code.split("\n")
-if (nlinenumber >= 1) && (nlinenumber <= codelines.size)
-stackentry["linetext"] = codelines[nlinenumber-1]
-elsif (nlinenumber == codelines.size + 1)
-stackentry["linetext"] = "<end of file>"
-else
-stackentry["linetext"] = "getExceptionTraceback: ScraperWiki internal error, line %d out of range in file %s" % [nlinenumber, code_filename]
-end
-stackentry["file"] = "<string>"
-else
-# XXX bit of a hack to show the line number in third party libraries
-stackentry["file"] += ":" + linenumber
-end
-if funcname
-stackentry["furtherlinetext"] = funcname
-end
-return stackentry
-end
-
-def getExceptionTraceback(e, code, code_filename)
-lbacktrace = e.backtrace.reverse
-#File.open("/tmp/fairuby", 'a') {|f| f.write(JSON.generate(lbacktrace)) }
-
-exceptiondescription = e.to_s
-
-stackdump = []
-for l in lbacktrace
-(filename, linenumber, funcname) = l.split(":")
-
-next if filename.match(/\/exec.rb$/) # skip showing stack of wrapper
-
-stackentry = _get_stackentry(code_filename, code, filename, linenumber, funcname)
-stackdump.push(stackentry)
-end
-
-if e.kind_of?(SyntaxError)
-(filename, linenumber, message) = exceptiondescription.split(/[:\n]/, 3)
-exceptiondescription = message
-
-stackentry = _get_stackentry(code_filename, code, filename, linenumber, nil)
-stackdump.push(stackentry)
-end
-
-return { 'message_type' => 'exception', 'exceptiondescription' => exceptiondescription, "stackdump" => stackdump }
-end
-
data/lib/version.rb
DELETED