scraperwiki 1.0.1 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/scraperwiki.rb +75 -431
- metadata +24 -77
- data/lib/scraperwiki/datastore.rb +0 -109
- data/lib/scraperwiki/stacktrace.rb +0 -51
- data/lib/version.rb +0 -4
data/lib/scraperwiki.rb
CHANGED
@@ -1,480 +1,124 @@
|
|
1
|
-
require '
|
2
|
-
|
3
|
-
require
|
4
|
-
require 'scraperwiki/datastore'
|
5
|
-
require 'httpclient'
|
6
|
-
|
7
|
-
class SqliteException < RuntimeError
|
8
|
-
end
|
9
|
-
|
10
|
-
class NoSuchTableSqliteException < SqliteException
|
11
|
-
end
|
12
|
-
|
13
|
-
$apiwrapperattacheddata = [ ]
|
1
|
+
require 'sqlite3'
|
2
|
+
$LOAD_PATH.unshift(File.expand_path(File.dirname(__FILE__)))
|
3
|
+
require 'scraperwiki/sqlite_save_info.rb'
|
14
4
|
|
15
5
|
module ScraperWiki
|
16
6
|
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
7
|
+
# The scrape method fetches the content from a webserver.
|
8
|
+
#
|
9
|
+
# === Parameters
|
10
|
+
#
|
11
|
+
# * _url_ = The URL to fetch
|
12
|
+
# * _params_ = The parameters to send with a POST request
|
13
|
+
# * _agent_ = A manually supplied user-agent string
|
14
|
+
#
|
15
|
+
# === Example
|
16
|
+
# ScraperWiki::scrape('http://scraperwiki.com')
|
17
|
+
#
|
18
|
+
def ScraperWiki.scrape(url, params = nil, agent = nil)
|
19
|
+
if agent
|
20
|
+
client = HTTPClient.new(:agent_name => agent)
|
21
|
+
else
|
22
|
+
client = HTTPClient.new
|
23
|
+
end
|
32
24
|
client.ssl_config.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
25
|
+
if HTTPClient.respond_to?("client.transparent_gzip_decompression=")
|
26
|
+
client.transparent_gzip_decompression = true
|
27
|
+
end
|
33
28
|
|
34
|
-
if params.nil?
|
35
|
-
|
29
|
+
if params.nil?
|
30
|
+
html = client.get_content(url)
|
36
31
|
else
|
37
|
-
|
32
|
+
html = client.post_content(url, params)
|
38
33
|
end
|
39
|
-
end
|
40
34
|
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
35
|
+
unless HTTPClient.respond_to?("client.transparent_gzip_decompression=")
|
36
|
+
begin
|
37
|
+
gz = Zlib::GzipReader.new(StringIO.new(html))
|
38
|
+
return gz.read
|
39
|
+
rescue
|
40
|
+
return html
|
47
41
|
end
|
48
|
-
|
42
|
+
end
|
49
43
|
end
|
50
44
|
|
45
|
+
# Saves the provided data into a local database for this scraper. Data is upserted
|
46
|
+
# into this table (inserted if it does not exist, updated if the unique keys say it
|
47
|
+
# does).
|
48
|
+
#
|
49
|
+
# === Parameters
|
50
|
+
#
|
51
|
+
# * _unique_keys_ = A list of column names that, used together, should be unique
|
52
|
+
# * _data_ = A hash of the data where the Key is the column name, the Value the row
|
53
|
+
# value. If sending lots of data this can be a list of hashes.
|
54
|
+
# * _table_name_ = The name that the newly created table should use.
|
55
|
+
#
|
56
|
+
# === Example
|
57
|
+
# ScraperWiki::save_sqlite(['id'], {'id'=>1})
|
58
|
+
#
|
59
|
+
def ScraperWiki.save_sqlite(unique_keys, data, table_name="swdata")
|
60
|
+
raise 'unique_keys must be nil or an array' if unique_keys != nil && !unique_keys.kind_of?(Array)
|
61
|
+
raise 'data must have a non-nil value' if data == nil
|
51
62
|
|
63
|
+
# convert :symbols to "strings"
|
64
|
+
unique_keys = unique_keys.map { |x| x.kind_of?(Symbol) ? x.to_s : x }
|
52
65
|
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
def ScraperWiki.save(unique_keys, data, date=nil, latlng=nil, table_name="swdata")
|
59
|
-
if unique_keys != nil && !unique_keys.kind_of?(Array)
|
60
|
-
raise 'unique_keys must be nil or an array'
|
61
|
-
end
|
62
|
-
if data == nil
|
63
|
-
raise 'data must have a non-nil value'
|
64
|
-
end
|
65
|
-
|
66
|
-
ds = SW_DataStore.create()
|
67
|
-
ldata = data.dup
|
68
|
-
if date != nil
|
69
|
-
ldata["date"] = date
|
70
|
-
end
|
71
|
-
if latlng != nil
|
72
|
-
ldata["latlng_lat"] = latlng[0]
|
73
|
-
ldata["latlng_lng"] = latlng[1]
|
66
|
+
if data.class == Hash
|
67
|
+
data = [ data ]
|
68
|
+
elsif data.length == 0
|
69
|
+
return
|
74
70
|
end
|
75
|
-
return ScraperWiki.save_sqlite(unique_keys, ldata, table_name="swdata", verbose=2)
|
76
|
-
end
|
77
|
-
|
78
71
|
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
else
|
84
|
-
username = 'resourcedir' # gets it into the right subdirectory automatically!!!
|
85
|
-
dirscrapername = ds.m_scrapername
|
86
|
-
if ds.m_scrapername == '' or ds.m_scrapername.nil?
|
87
|
-
dirscrapername = 'DRAFT__' + ds.m_runid.gsub(/[\.\-]/, '_')
|
88
|
-
end
|
89
|
-
path = "%s/%s" % [username, dirscrapername]
|
90
|
-
|
91
|
-
record = {"query"=>sqlquery, "params"=>data, "attach"=>[]}
|
92
|
-
$attachlist.each do |value|
|
93
|
-
record["attach"].push({"user"=>username, "database"=>value["name"], "alias"=>value["asattach"], "securityhash"=>"somthing"})
|
94
|
-
end
|
95
|
-
|
96
|
-
httpcall = Net::HTTP.new(ds.m_host, ds.m_webstore_port)
|
97
|
-
headers = { "Accept"=>"application/json+tuples", "X-Scrapername"=>ds.m_scrapername, "X-Runid"=>ds.m_runid, "Content-Type"=>"application/json" }
|
98
|
-
response = httpcall.put(path, JSON.generate(record), headers)
|
99
|
-
res = JSON.parse(response.body)
|
100
|
-
if res["state"] == "error"
|
101
|
-
ScraperWiki.raisesqliteerror(res["message"])
|
102
|
-
end
|
103
|
-
if (res.class == Hash) and (res["keys"].class == Array) and (res["data"].class == Array)
|
104
|
-
if res["keys"].include?("state") and (res["data"].length == 1)
|
105
|
-
ddata = Hash[*res["keys"].zip(res["data"][0]).flatten]
|
106
|
-
if ddata["state"] == "error"
|
107
|
-
ScraperWiki.raisesqliteerror(ddata["message"])
|
108
|
-
end
|
109
|
-
end
|
110
|
-
end
|
111
|
-
end
|
72
|
+
rjdata = [ ]
|
73
|
+
for ldata in data
|
74
|
+
ljdata = _convdata(unique_keys, ldata)
|
75
|
+
rjdata.push(ljdata)
|
112
76
|
|
113
|
-
if verbose
|
114
|
-
if data.kind_of?(Array)
|
115
|
-
data.each do |value|
|
116
|
-
ldata = [ ]
|
117
|
-
if value == nil
|
118
|
-
value = ''
|
119
|
-
end
|
120
|
-
ldata.push(ScraperWiki._unicode_truncate(value.to_s, 50))
|
121
|
-
end
|
122
|
-
else
|
123
|
-
ldata = data
|
124
|
-
end
|
125
|
-
ScraperWiki.dumpMessage({'message_type'=>'sqlitecall', 'command'=>"execute", 'val1'=>sqlquery, 'val2'=>ldata})
|
126
77
|
end
|
127
|
-
return res
|
128
|
-
end
|
129
|
-
|
130
78
|
|
79
|
+
SQLiteMagic._do_save_sqlite(unique_keys, rjdata, table_name)
|
80
|
+
end
|
131
81
|
|
132
|
-
|
82
|
+
# Internal function to check a row of data and convert it to the right format
|
133
83
|
def ScraperWiki._convdata(unique_keys, scraper_data)
|
134
84
|
if unique_keys
|
135
85
|
for key in unique_keys
|
136
86
|
if !key.kind_of?(String) and !key.kind_of?(Symbol)
|
137
|
-
return
|
87
|
+
return 'unique_keys must each be a string or a symbol, this one is not: ' + key
|
138
88
|
end
|
139
89
|
if !scraper_data.include?(key) and !scraper_data.include?(key.to_sym)
|
140
|
-
return
|
90
|
+
return 'unique_keys must be a subset of data, this one is not: ' + key
|
141
91
|
end
|
142
92
|
if scraper_data[key] == nil and scraper_data[key.to_sym] == nil
|
143
|
-
return
|
93
|
+
return 'unique_key value should not be nil, this one is nil: ' + key
|
144
94
|
end
|
145
95
|
end
|
146
96
|
end
|
147
97
|
|
148
98
|
jdata = { }
|
149
99
|
scraper_data.each_pair do |key, value|
|
150
|
-
if not key
|
151
|
-
return { "error" => 'key must not be blank', "bad_key" => key }
|
152
|
-
end
|
153
|
-
if key.kind_of?(Symbol)
|
154
|
-
key = key.to_s
|
155
|
-
end
|
156
|
-
if key.class != String
|
157
|
-
return { "error" => 'key must be string type', "bad_key" => key }
|
158
|
-
end
|
100
|
+
raise 'key must not have blank name' if not key
|
159
101
|
|
160
|
-
if
|
161
|
-
|
162
|
-
|
102
|
+
key = key.to_s if key.kind_of?(Symbol)
|
103
|
+
raise 'key must be string or symbol type: ' + key if key.class != String
|
104
|
+
raise 'key must be simple text: ' + key if !/[a-zA-Z0-9_\- ]+$/.match(key)
|
163
105
|
|
164
|
-
|
106
|
+
# convert formats
|
107
|
+
if value.kind_of?(Date)
|
165
108
|
value = value.iso8601
|
166
109
|
end
|
167
110
|
if value.kind_of?(Time)
|
168
111
|
value = value.iso8601
|
169
|
-
raise "internal error, timezone came out as non-UTC while converting to SQLite format" unless value.match(
|
170
|
-
value.gsub!(
|
112
|
+
raise "internal error, timezone came out as non-UTC while converting to SQLite format" unless value.match(/([+-]00:00|Z)$/)
|
113
|
+
value.gsub!(/([+-]00:00|Z)$/, '')
|
171
114
|
end
|
172
115
|
if ![Fixnum, Float, String, TrueClass, FalseClass, NilClass].include?(value.class)
|
173
116
|
value = value.to_s
|
174
117
|
end
|
118
|
+
|
175
119
|
jdata[key] = value
|
176
120
|
end
|
177
121
|
return jdata
|
178
122
|
end
|
179
123
|
|
180
|
-
|
181
|
-
def ScraperWiki.save_sqlite(unique_keys, data, table_name="swdata", verbose=2)
|
182
|
-
if !data
|
183
|
-
ScraperWiki.dumpMessage({'message_type' => 'data', 'content' => "EMPTY SAVE IGNORED"})
|
184
|
-
return
|
185
|
-
end
|
186
|
-
|
187
|
-
# convert :symbols to "strings"
|
188
|
-
unique_keys = unique_keys.map { |x| x.kind_of?(Symbol) ? x.to_s : x }
|
189
|
-
|
190
|
-
if data.class == Hash
|
191
|
-
data = [ data ]
|
192
|
-
end
|
193
|
-
|
194
|
-
rjdata = [ ]
|
195
|
-
for ldata in data
|
196
|
-
ljdata = _convdata(unique_keys, ldata)
|
197
|
-
if ljdata.include?("error")
|
198
|
-
raise SqliteException.new(ljdata["error"])
|
199
|
-
end
|
200
|
-
rjdata.push(ljdata)
|
201
|
-
end
|
202
|
-
|
203
|
-
ds = SW_DataStore.create()
|
204
|
-
if ds.m_webstore_port == 0
|
205
|
-
res = ds.request({'maincommand'=>'save_sqlite', 'unique_keys'=>unique_keys, 'data'=>rjdata, 'swdatatblname'=>table_name})
|
206
|
-
else
|
207
|
-
username = 'resourcedir' # gets it into the right subdirectory automatically!!!
|
208
|
-
dirscrapername = ds.m_scrapername
|
209
|
-
if ds.m_scrapername == '' or ds.m_scrapername.nil?
|
210
|
-
dirscrapername = 'DRAFT__' + ds.m_runid.gsub(/[\.\-]/, '_')
|
211
|
-
end
|
212
|
-
|
213
|
-
# (do something about jargtypes later)
|
214
|
-
qsl = [ ]
|
215
|
-
unique_keys.each do |key|
|
216
|
-
qsl.push("unique="+URI.encode(key))
|
217
|
-
end
|
218
|
-
|
219
|
-
# quick and dirty provision of column types to the webstore
|
220
|
-
if rjdata.length != 0
|
221
|
-
jargtypes = { }
|
222
|
-
rjdata[0].each_pair do |k, v|
|
223
|
-
if v != nil
|
224
|
-
#if k[-5..-1] == "_blob"
|
225
|
-
# vt = "blob" # coerced into affinity none
|
226
|
-
if v.class == Fixnum
|
227
|
-
vt = "integer"
|
228
|
-
elsif v.class == Float
|
229
|
-
vt = "real"
|
230
|
-
else
|
231
|
-
vt = "text"
|
232
|
-
end
|
233
|
-
jargtypes[k] = vt
|
234
|
-
end
|
235
|
-
end
|
236
|
-
qsl.push(("jargtypes="+JSON.generate(jargtypes)))
|
237
|
-
end
|
238
|
-
|
239
|
-
path = "%s/%s/%s?%s" % [username, dirscrapername, table_name, qsl.join("&")]
|
240
|
-
#puts JSON.generate(rjdata)
|
241
|
-
httpcall = Net::HTTP.new(ds.m_host, ds.m_webstore_port)
|
242
|
-
headers = { "Accept"=>"application/json", "X-Scrapername"=>ds.m_scrapername, "X-Runid"=>ds.m_runid, "Content-Type"=>"application/json" }
|
243
|
-
response = httpcall.post(path, JSON.generate(rjdata), headers)
|
244
|
-
#puts response.body
|
245
|
-
res = JSON.parse(response.body)
|
246
|
-
if res["state"] == "error"
|
247
|
-
res["error"] = res["message"]
|
248
|
-
end
|
249
|
-
end
|
250
|
-
|
251
|
-
|
252
|
-
if res["error"]
|
253
|
-
raise SqliteException.new(res["error"])
|
254
|
-
end
|
255
|
-
|
256
|
-
if verbose >= 2
|
257
|
-
pdata = { }
|
258
|
-
if rjdata.class == Hash
|
259
|
-
sdata = rjdata
|
260
|
-
else
|
261
|
-
sdata = rjdata[0]
|
262
|
-
end
|
263
|
-
sdata.each_pair do |key, value|
|
264
|
-
key = ScraperWiki._unicode_truncate(key.to_s, 50)
|
265
|
-
if value == nil
|
266
|
-
value = ''
|
267
|
-
else
|
268
|
-
value = ScraperWiki._unicode_truncate(value.to_s, 50)
|
269
|
-
end
|
270
|
-
pdata[key] = String(value)
|
271
|
-
end
|
272
|
-
if rjdata.class == Array and rjdata.size > 1
|
273
|
-
pdata["number_records"] = "Number Records: "+String(rjdata.size)
|
274
|
-
end
|
275
|
-
ScraperWiki.dumpMessage({'message_type' => 'data', 'content' => pdata})
|
276
|
-
end
|
277
|
-
return res
|
278
|
-
end
|
279
|
-
|
280
|
-
# also needs to handle the types better (could save json and datetime objects handily
|
281
|
-
def ScraperWiki.save_var(name, value, verbose=2)
|
282
|
-
vtype = String(value.class)
|
283
|
-
svalue = value.to_s
|
284
|
-
if vtype != "Fixnum" and vtype != "String" and vtype != "Float" and vtype != "NilClass"
|
285
|
-
puts "*** object of type "+vtype+" converted to string\n"
|
286
|
-
end
|
287
|
-
data = { "name" => name, "value_blob" => svalue, "type" => vtype }
|
288
|
-
ScraperWiki.save_sqlite(unique_keys=["name"], data=data, table_name="swvariables", verbose=verbose)
|
289
|
-
end
|
290
|
-
|
291
|
-
def ScraperWiki.get_var(name, default=nil, verbose=2)
|
292
|
-
begin
|
293
|
-
result = ScraperWiki.sqliteexecute("select value_blob, type from swvariables where name=?", [name], verbose)
|
294
|
-
rescue NoSuchTableSqliteException => e
|
295
|
-
return default
|
296
|
-
end
|
297
|
-
|
298
|
-
if !result.has_key?("data")
|
299
|
-
return default
|
300
|
-
end
|
301
|
-
|
302
|
-
if result["data"].length == 0
|
303
|
-
return default
|
304
|
-
end
|
305
|
-
# consider casting to type
|
306
|
-
svalue = result["data"][0][0]
|
307
|
-
vtype = result["data"][0][1]
|
308
|
-
if vtype == "Fixnum"
|
309
|
-
return svalue.to_i
|
310
|
-
end
|
311
|
-
if vtype == "Float"
|
312
|
-
return svalue.to_f
|
313
|
-
end
|
314
|
-
if vtype == "NilClass"
|
315
|
-
return nil
|
316
|
-
end
|
317
|
-
return svalue
|
318
|
-
end
|
319
|
-
|
320
|
-
# These are DEPRECATED and just here for compatibility
|
321
|
-
def ScraperWiki.get_metadata(metadata_name, default = nil)
|
322
|
-
if !$metadatamessagedone == nil
|
323
|
-
puts "*** instead of get_metadata('"+metadata_name+"') please use\n get_var('"+metadata_name+"')"
|
324
|
-
metadatamessagedone = true
|
325
|
-
end
|
326
|
-
result = ScraperWiki.get_var(metadata_name, default)
|
327
|
-
return result
|
328
|
-
end
|
329
|
-
|
330
|
-
# These are DEPRECATED and just here for compatibility
|
331
|
-
def ScraperWiki.save_metadata(metadata_name, value)
|
332
|
-
if !$metadatamessagedone
|
333
|
-
puts "*** instead of save_metadata('"+metadata_name+"') please use\n save_var('"+metadata_name+"')"
|
334
|
-
$metadatamessagedone = true
|
335
|
-
end
|
336
|
-
return ScraperWiki.save_var(metadata_name, value)
|
337
|
-
end
|
338
|
-
|
339
|
-
|
340
|
-
def ScraperWiki.show_tables(dbname=nil)
|
341
|
-
name = "sqlite_master"
|
342
|
-
if dbname != nil
|
343
|
-
name = "`"+dbname+"`.sqlite_master"
|
344
|
-
end
|
345
|
-
result = ScraperWiki.sqliteexecute("select tbl_name, sql from "+name+" where type='table'")
|
346
|
-
#return result["data"]
|
347
|
-
return (Hash[*result["data"].flatten]) # pre-1.8.7
|
348
|
-
end
|
349
|
-
|
350
|
-
|
351
|
-
def ScraperWiki.table_info(name)
|
352
|
-
sname = name.split(".")
|
353
|
-
if sname.length == 2
|
354
|
-
result = ScraperWiki.sqliteexecute("PRAGMA %s.table_info(`%s`)" % sname)
|
355
|
-
else
|
356
|
-
result = ScraperWiki.sqliteexecute("PRAGMA table_info(`%s`)" % name)
|
357
|
-
end
|
358
|
-
res = [ ]
|
359
|
-
for d in result["data"]
|
360
|
-
res.push(Hash[*result["keys"].zip(d).flatten]) # pre-1.8.7
|
361
|
-
end
|
362
|
-
return res
|
363
|
-
end
|
364
|
-
|
365
|
-
|
366
|
-
def ScraperWiki.getDataByDate(name, start_date, end_date, limit=-1, offset=0)
|
367
|
-
raise SqliteException.new("getDataByDate has been deprecated")
|
368
|
-
end
|
369
|
-
|
370
|
-
def ScraperWiki.getDataByLocation(name, lat, lng, limit=-1, offset=0)
|
371
|
-
raise SqliteException.new("getDataByLocation has been deprecated")
|
372
|
-
end
|
373
|
-
|
374
|
-
def ScraperWiki.search(name, filterdict, limit=-1, offset=0)
|
375
|
-
raise SqliteException.new("SW_APIWrapper.search has been deprecated")
|
376
|
-
end
|
377
|
-
|
378
|
-
def ScraperWiki.raisesqliteerror(rerror)
|
379
|
-
if /sqlite3.Error: no such table:/.match(rerror) # old dataproxy
|
380
|
-
raise NoSuchTableSqliteException.new(rerror)
|
381
|
-
end
|
382
|
-
if /DB Error: \(OperationalError\) no such table:/.match(rerror)
|
383
|
-
raise NoSuchTableSqliteException.new(rerror)
|
384
|
-
end
|
385
|
-
raise SqliteException.new(rerror)
|
386
|
-
end
|
387
|
-
|
388
|
-
def ScraperWiki.attach(name, asname=nil, verbose=1)
|
389
|
-
$attachlist.push({"name"=>name, "asname"=>asname})
|
390
|
-
|
391
|
-
ds = SW_DataStore.create()
|
392
|
-
|
393
|
-
if ds.m_webstore_port == 0
|
394
|
-
res = ds.request({'maincommand'=>'sqlitecommand', 'command'=>"attach", 'name'=>name, 'asname'=>asname})
|
395
|
-
if res["error"]
|
396
|
-
ScraperWiki.raisesqliteerror(res)
|
397
|
-
end
|
398
|
-
else
|
399
|
-
res = {'status'=>'ok'}
|
400
|
-
end
|
401
|
-
|
402
|
-
if verbose
|
403
|
-
ScraperWiki.dumpMessage({'message_type'=>'sqlitecall', 'command'=>"attach", 'val1'=>name, 'val2'=>asname})
|
404
|
-
end
|
405
|
-
|
406
|
-
return res
|
407
|
-
end
|
408
|
-
|
409
|
-
|
410
|
-
def ScraperWiki.commit(verbose=1)
|
411
|
-
ds = SW_DataStore.create()
|
412
|
-
if ds.m_webstore_port == 0
|
413
|
-
res = ds.request({'maincommand'=>'sqlitecommand', 'command'=>"commit"})
|
414
|
-
else
|
415
|
-
puts "*** commit() no longer a necessary function call"
|
416
|
-
res = {'status'=>'ok'}
|
417
|
-
end
|
418
|
-
end
|
419
|
-
|
420
|
-
def ScraperWiki.select(sqlquery, data=nil, verbose=1)
|
421
|
-
if data != nil && sqlquery.scan(/\?/).length != 0 && data.class != Array
|
422
|
-
data = [data]
|
423
|
-
end
|
424
|
-
result = ScraperWiki.sqliteexecute("select "+sqlquery, data, verbose)
|
425
|
-
res = [ ]
|
426
|
-
for d in result["data"]
|
427
|
-
#res.push(Hash[result["keys"].zip(d)]) # post-1.8.7
|
428
|
-
res.push(Hash[*result["keys"].zip(d).flatten]) # pre-1.8.7
|
429
|
-
end
|
430
|
-
return res
|
431
|
-
end
|
432
|
-
|
433
|
-
# old functions put back in for regression
|
434
|
-
def ScraperWiki.getData(name, limit=-1, offset=0)
|
435
|
-
if !$apiwrapperattacheddata.include?(name)
|
436
|
-
puts "*** instead of getData('"+name+"') please use\n ScraperWiki.attach('"+name+"') \n print ScraperWiki.select('* from `"+name+"`.swdata')"
|
437
|
-
ScraperWiki.attach(name)
|
438
|
-
$apiwrapperattacheddata.push(name)
|
439
|
-
end
|
440
|
-
|
441
|
-
apilimit = 500
|
442
|
-
g = Enumerator.new do |g|
|
443
|
-
count = 0
|
444
|
-
while true
|
445
|
-
if limit == -1
|
446
|
-
step = apilimit
|
447
|
-
else
|
448
|
-
step = apilimit < (limit - count) ? apilimit : limit - count
|
449
|
-
end
|
450
|
-
query = "* from `#{name}`.swdata limit #{step} offset #{offset+count}"
|
451
|
-
|
452
|
-
records = ScraperWiki.select(query)
|
453
|
-
for r in records
|
454
|
-
g.yield r
|
455
|
-
end
|
456
|
-
|
457
|
-
count += records.length
|
458
|
-
if records.length < step
|
459
|
-
break
|
460
|
-
end
|
461
|
-
if limit != -1 and count >= limit
|
462
|
-
break
|
463
|
-
end
|
464
|
-
end
|
465
|
-
end
|
466
|
-
end
|
467
|
-
|
468
|
-
def ScraperWiki.getKeys(name)
|
469
|
-
if !$apiwrapperattacheddata.include?(name)
|
470
|
-
puts "*** instead of getKeys('"+name+"') please use\n ScraperWiki.attach('"+name+"') \n print ScraperWiki.sqliteexecute('select * from `"+name+"`.swdata limit 0')['keys']"
|
471
|
-
ScraperWiki.attach(name)
|
472
|
-
$apiwrapperattacheddata.push(name)
|
473
|
-
end
|
474
|
-
result = ScraperWiki.sqliteexecute("select * from `"+name+"`.swdata limit 0")
|
475
|
-
if result.include?("error")
|
476
|
-
raise SqliteException.new(result["error"])
|
477
|
-
end
|
478
|
-
return result["keys"]
|
479
|
-
end
|
480
124
|
end
|
metadata
CHANGED
@@ -1,98 +1,45 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: scraperwiki
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 2.0.0
|
5
5
|
prerelease:
|
6
|
-
segments:
|
7
|
-
- 1
|
8
|
-
- 0
|
9
|
-
- 1
|
10
|
-
version: 1.0.1
|
11
6
|
platform: ruby
|
12
|
-
authors:
|
13
|
-
- Francis
|
7
|
+
authors:
|
8
|
+
- Francis irving
|
14
9
|
autorequire:
|
15
10
|
bindir: bin
|
16
11
|
cert_chain: []
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
- !ruby/object:Gem::Dependency
|
22
|
-
name: json
|
23
|
-
prerelease: false
|
24
|
-
requirement: &id001 !ruby/object:Gem::Requirement
|
25
|
-
none: false
|
26
|
-
requirements:
|
27
|
-
- - ">="
|
28
|
-
- !ruby/object:Gem::Version
|
29
|
-
hash: 3
|
30
|
-
segments:
|
31
|
-
- 0
|
32
|
-
version: "0"
|
33
|
-
type: :runtime
|
34
|
-
version_requirements: *id001
|
35
|
-
- !ruby/object:Gem::Dependency
|
36
|
-
name: httpclient
|
37
|
-
prerelease: false
|
38
|
-
requirement: &id002 !ruby/object:Gem::Requirement
|
39
|
-
none: false
|
40
|
-
requirements:
|
41
|
-
- - ">="
|
42
|
-
- !ruby/object:Gem::Version
|
43
|
-
hash: 3
|
44
|
-
segments:
|
45
|
-
- 0
|
46
|
-
version: "0"
|
47
|
-
type: :runtime
|
48
|
-
version_requirements: *id002
|
49
|
-
description: Ruby code used for accessing
|
50
|
-
email:
|
51
|
-
- francis@scraperwiki.com
|
12
|
+
date: 2013-04-04 00:00:00.000000000 Z
|
13
|
+
dependencies: []
|
14
|
+
description: A library for scraping web pages and saving data easily
|
15
|
+
email: francis@scraperwiki.com
|
52
16
|
executables: []
|
53
|
-
|
54
17
|
extensions: []
|
55
|
-
|
56
18
|
extra_rdoc_files: []
|
57
|
-
|
58
|
-
files:
|
59
|
-
- lib/version.rb
|
19
|
+
files:
|
60
20
|
- lib/scraperwiki.rb
|
61
|
-
|
62
|
-
- lib/scraperwiki/stacktrace.rb
|
63
|
-
has_rdoc: true
|
64
|
-
homepage: http://scraperwiki.com
|
21
|
+
homepage: http://rubygems.org/gems/scraperwiki
|
65
22
|
licenses: []
|
66
|
-
|
67
23
|
post_install_message:
|
68
24
|
rdoc_options: []
|
69
|
-
|
70
|
-
require_paths:
|
25
|
+
require_paths:
|
71
26
|
- lib
|
72
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
27
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
73
28
|
none: false
|
74
|
-
requirements:
|
75
|
-
- -
|
76
|
-
- !ruby/object:Gem::Version
|
77
|
-
|
78
|
-
|
79
|
-
- 0
|
80
|
-
version: "0"
|
81
|
-
required_rubygems_version: !ruby/object:Gem::Requirement
|
29
|
+
requirements:
|
30
|
+
- - ! '>='
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '0'
|
33
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
82
34
|
none: false
|
83
|
-
requirements:
|
84
|
-
- -
|
85
|
-
- !ruby/object:Gem::Version
|
86
|
-
|
87
|
-
segments:
|
88
|
-
- 0
|
89
|
-
version: "0"
|
35
|
+
requirements:
|
36
|
+
- - ! '>='
|
37
|
+
- !ruby/object:Gem::Version
|
38
|
+
version: '0'
|
90
39
|
requirements: []
|
91
|
-
|
92
40
|
rubyforge_project:
|
93
|
-
rubygems_version: 1.
|
41
|
+
rubygems_version: 1.8.23
|
94
42
|
signing_key:
|
95
43
|
specification_version: 3
|
96
|
-
summary: ScraperWiki
|
44
|
+
summary: ScraperWiki
|
97
45
|
test_files: []
|
98
|
-
|
@@ -1,109 +0,0 @@
|
|
1
|
-
require 'json'
|
2
|
-
require 'singleton'
|
3
|
-
require 'thread'
|
4
|
-
require 'cgi'
|
5
|
-
|
6
|
-
# the python version of this makes use of a global static copy of the class
|
7
|
-
# so the connection is made only once to the dataproxy
|
8
|
-
# I think the Singleton module implements this magically
|
9
|
-
|
10
|
-
class SW_DataStore
|
11
|
-
|
12
|
-
@@lock = Mutex.new
|
13
|
-
|
14
|
-
include Singleton
|
15
|
-
|
16
|
-
attr_accessor :m_port, :m_host, :m_scrapername, :m_runid, :m_attachables, :m_webstore_port
|
17
|
-
|
18
|
-
def initialize
|
19
|
-
@m_socket = nil
|
20
|
-
@m_host = nil
|
21
|
-
@m_port = nil
|
22
|
-
@m_scrapername = ''
|
23
|
-
@m_runid = ''
|
24
|
-
@m_attachables = []
|
25
|
-
@webstore_port = 0
|
26
|
-
end
|
27
|
-
|
28
|
-
|
29
|
-
def ensure_connected
|
30
|
-
# Connect to the data proxy. The data proxy will need to make an Ident call
|
31
|
-
# back to get the scraperID. Since the data proxy may be on another machine
|
32
|
-
# and the peer address it sees will have been subject to NAT or masquerading,
|
33
|
-
# send the UML name and the socket port number in the request.
|
34
|
-
|
35
|
-
if @m_socket == nil
|
36
|
-
@m_socket = TCPSocket.open(@m_host, @m_port)
|
37
|
-
proto, port, name, ip = @m_socket.addr()
|
38
|
-
if @m_scrapername == '' or @m_scrapername.nil?
|
39
|
-
sname = ''
|
40
|
-
else
|
41
|
-
sname = CGI::escape(@m_scrapername)
|
42
|
-
end
|
43
|
-
if @m_runid == '' or @m_runid.nil?
|
44
|
-
rid = ''
|
45
|
-
else
|
46
|
-
rid = CGI::escape(@m_runid)
|
47
|
-
end
|
48
|
-
|
49
|
-
getmsg = "GET /?uml=%s&port=%s&vscrapername=%s&vrunid=%s HTTP/1.1\n\n" % ['lxc', port, sname, rid]
|
50
|
-
@m_socket.send(getmsg, 0)
|
51
|
-
@m_socket.flush()
|
52
|
-
|
53
|
-
buffer = @m_socket.recv(1024)
|
54
|
-
result = JSON.parse(buffer)
|
55
|
-
if result["status"] != "good"
|
56
|
-
raise result["status"]
|
57
|
-
end
|
58
|
-
end
|
59
|
-
end
|
60
|
-
|
61
|
-
def request (req)
|
62
|
-
text = ''
|
63
|
-
@@lock.synchronize {
|
64
|
-
ensure_connected
|
65
|
-
reqmsg = JSON.generate(req) + "\n"
|
66
|
-
|
67
|
-
bytes_sent = 0
|
68
|
-
while bytes_sent < reqmsg.length
|
69
|
-
bytes_sent += @m_socket.send(reqmsg.slice(bytes_sent, reqmsg.length), 0)
|
70
|
-
end
|
71
|
-
@m_socket.flush()
|
72
|
-
|
73
|
-
while true
|
74
|
-
buffer = @m_socket.recv(1024)
|
75
|
-
if buffer.length == 0
|
76
|
-
break
|
77
|
-
end
|
78
|
-
text += buffer
|
79
|
-
if text[-1] == "\n"[0]
|
80
|
-
break
|
81
|
-
end
|
82
|
-
end
|
83
|
-
}
|
84
|
-
return JSON.parse(text)
|
85
|
-
end
|
86
|
-
|
87
|
-
# function used to both initialize the settings and get an instance!
|
88
|
-
# this is ridiculous and unnecessary with new webstore.
|
89
|
-
# we are creating object without the fields merely to access the static variables!
|
90
|
-
def SW_DataStore.create(host=nil, port = nil, scrapername = '', runid = nil, attachables = nil, webstore_port = nil)
|
91
|
-
instance = SW_DataStore.instance
|
92
|
-
# so, it might be intended that the host and port are
|
93
|
-
# set once, never to be changed, but this is ruby so
|
94
|
-
# there's no way to guarantee that.
|
95
|
-
if host && port && instance.m_port.nil? && instance.m_host.nil?
|
96
|
-
instance.m_host = host
|
97
|
-
instance.m_port = port
|
98
|
-
instance.m_scrapername = scrapername
|
99
|
-
instance.m_runid = runid
|
100
|
-
instance.m_attachables = attachables
|
101
|
-
instance.m_webstore_port = webstore_port
|
102
|
-
elsif host && port
|
103
|
-
raise "Can't change host and port once connection made"
|
104
|
-
elsif !(instance.m_port) || !(instance.m_host)
|
105
|
-
raise "Can't return a datastore without port/host information"
|
106
|
-
end
|
107
|
-
instance
|
108
|
-
end
|
109
|
-
end
|
@@ -1,51 +0,0 @@
|
|
1
|
-
def _get_stackentry(code_filename, code, filename, linenumber, funcname)
|
2
|
-
nlinenumber = linenumber.to_i
|
3
|
-
stackentry = {"file" => filename, "linenumber" => nlinenumber, "duplicates" => 1}
|
4
|
-
|
5
|
-
if filename == "(eval)" or filename == code_filename
|
6
|
-
codelines = code.split("\n")
|
7
|
-
if (nlinenumber >= 1) && (nlinenumber <= codelines.size)
|
8
|
-
stackentry["linetext"] = codelines[nlinenumber-1]
|
9
|
-
elsif (nlinenumber == codelines.size + 1)
|
10
|
-
stackentry["linetext"] = "<end of file>"
|
11
|
-
else
|
12
|
-
stackentry["linetext"] = "getExceptionTraceback: ScraperWiki internal error, line %d out of range in file %s" % [nlinenumber, code_filename]
|
13
|
-
end
|
14
|
-
stackentry["file"] = "<string>"
|
15
|
-
else
|
16
|
-
# XXX bit of a hack to show the line number in third party libraries
|
17
|
-
stackentry["file"] += ":" + linenumber
|
18
|
-
end
|
19
|
-
if funcname
|
20
|
-
stackentry["furtherlinetext"] = funcname
|
21
|
-
end
|
22
|
-
return stackentry
|
23
|
-
end
|
24
|
-
|
25
|
-
def getExceptionTraceback(e, code, code_filename)
|
26
|
-
lbacktrace = e.backtrace.reverse
|
27
|
-
#File.open("/tmp/fairuby", 'a') {|f| f.write(JSON.generate(lbacktrace)) }
|
28
|
-
|
29
|
-
exceptiondescription = e.to_s
|
30
|
-
|
31
|
-
stackdump = []
|
32
|
-
for l in lbacktrace
|
33
|
-
(filename, linenumber, funcname) = l.split(":")
|
34
|
-
|
35
|
-
next if filename.match(/\/exec.rb$/) # skip showing stack of wrapper
|
36
|
-
|
37
|
-
stackentry = _get_stackentry(code_filename, code, filename, linenumber, funcname)
|
38
|
-
stackdump.push(stackentry)
|
39
|
-
end
|
40
|
-
|
41
|
-
if e.kind_of?(SyntaxError)
|
42
|
-
(filename, linenumber, message) = exceptiondescription.split(/[:\n]/, 3)
|
43
|
-
exceptiondescription = message
|
44
|
-
|
45
|
-
stackentry = _get_stackentry(code_filename, code, filename, linenumber, nil)
|
46
|
-
stackdump.push(stackentry)
|
47
|
-
end
|
48
|
-
|
49
|
-
return { 'message_type' => 'exception', 'exceptiondescription' => exceptiondescription, "stackdump" => stackdump }
|
50
|
-
end
|
51
|
-
|
data/lib/version.rb
DELETED