scraperwiki 2.0.6 → 3.0.0

data/.gitignore ADDED
@@ -0,0 +1,4 @@
+ *.gem
+ *.sw[po]
+ .DS_Store
+ Gemfile.lock
data/Gemfile ADDED
@@ -0,0 +1,6 @@
+ # A sample Gemfile
+ source "https://rubygems.org"
+
+ # Specify your gem's spec
+ gemspec
+
data/LICENCE ADDED
@@ -0,0 +1,8 @@
+ Copyright (c) 2013, ScraperWiki Limited
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+ Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+ Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
data/README.md ADDED
@@ -0,0 +1,90 @@
+ # ScraperWiki Ruby library [![Build Status](https://travis-ci.org/openc/scraperwiki-ruby.png)](https://travis-ci.org/openc/scraperwiki-ruby)
+
+ This is a Ruby library for scraping web pages and saving data. It is a fork/rewrite of the original [scraperwiki-ruby](https://github.com/scraperwiki/scraperwiki-ruby) gem, extracting the SQLite utility methods into the [sqlite_magic](https://github.com/openc/sqlite_magic) gem.
+
+ It is a work in progress (for example, it doesn't yet create indices automatically), but it should allow ScraperWiki Classic scripts to be run locally.
+
+ ## Installing
+
+ Add this line to your application's Gemfile:
+
+     gem 'scraperwiki', :git => 'git@github.com:openc/scraperwiki-ruby.git'
+
+ And then execute:
+
+     $ bundle
+
+ ## Scraping
+
+ ### ScraperWiki.scrape(url[, params])
+
+ Returns the downloaded string from the given *url*. *params* are sent as a POST if set.
+
+ ### Saving data
+
+ Helper functions for saving and querying an SQL database. Updates the schema
+ automatically according to the data you save.
+
+ Currently only supports SQLite. It will create a local SQLite database.
+
+ ### ScraperWiki.save\_sqlite(unique\_keys, data[, table\_name = "swdata"[, verbose]])
+
+ Saves a data record into the datastore, in the table given
+ by *table_name*.
+
+ *data* is a hash with field names as keys (which can be strings or symbols).
+
+ *unique_keys* is a subset of data.keys() which determines when a record is
+ overwritten.
+
+ For large numbers of records *data* can be an array of hashes.
+
+ *verbose* is kept for smooth migration from ScraperWiki Classic and doesn't do anything yet.
+
+ ### ScraperWiki.sqliteexecute(query[, params], verbose)
+
+ Executes the provided query against the database, binding any given parameters, and returns the results as an array of hashes.
+
+ *query* is a SQL statement.
+
+ *params*, an array of values to bind to the ? placeholders in a prepared statement.
+
+ ### ScraperWiki.save\_var(name, value, verbose)
+
+ Allows the user to save a single variable (at a time) to carry state across runs of the scraper.
+
+ *name*, the variable name.
+
+ *value*, the value of the variable.
+
+ *verbose*, verbosity level (not currently implemented).
+
+ ### ScraperWiki.get\_var(name, default, verbose)
+
+ Allows the user to retrieve a previously saved variable.
+
+ *name*, the variable name to fetch.
+
+ *default*, the value to use if the variable name is not found.
+
+ *verbose*, verbosity level (not currently implemented).
+
+ ### ScraperWiki.select(partial\_query[, params], verbose)
+
+ Allows for a simplified select statement.
+
+ *partial_query*, a valid select statement, without the select keyword.
+
+ *params*, any data provided for ? replacements in the query.
+
+ *verbose*, verbosity level (not currently implemented).
+
+ ## Usage
+
+ Run your Ruby scraper and any data saved will be put in an SQLite database in the current directory called `scraperwiki.sqlite`.
+
+ If you're using scrapers from ScraperWiki Classic, remember to add `require 'scraperwiki'` to your file if it's not already there.
+
+ ## Development
+
+ You need the `sqlite3` program installed to run tests. To install it on Ubuntu, run `sudo apt-get install sqlite3`.
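
For orientation, here is a minimal end-to-end sketch of the API documented in the README above (the URL, field names and values are illustrative only, not part of the gem):

    require 'scraperwiki'

    # Fetch a page; passing params would turn this into a POST
    html = ScraperWiki.scrape('http://example.com')

    # Upsert a record into the default 'swdata' table, keyed on 'id'
    ScraperWiki.save_sqlite(['id'], { 'id' => 1, 'html_length' => html.length })

    # Carry state across runs
    ScraperWiki.save_var('last_id', 1)
    last_id = ScraperWiki.get_var('last_id', 0)

    # Query it back; returns an array of hashes
    rows = ScraperWiki.select('* from swdata where id = ?', [last_id])
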
data/Rakefile ADDED
@@ -0,0 +1,8 @@
+ # require 'bundler/gem_tasks'
+ require 'rake/testtask'
+
+ Rake::TestTask.new do |t|
+   t.libs << 'test'
+ end
+
+ task :default => :test
data/lib/scraperwiki.rb CHANGED
@@ -1,234 +1,181 @@
  require 'httpclient'
- require 'sqlite3'
  $LOAD_PATH.unshift(File.expand_path(File.dirname(__FILE__)))
- require 'scraperwiki/sqlite_save_info.rb'
-
- class SqliteException < RuntimeError
- end
+ # require 'scraperwiki/sqlite_save_info.rb'
+ require 'scraperwiki/version.rb'
+ require 'sqlite_magic'

  module ScraperWiki
-
-   # The scrape method fetches the content from a webserver.
-   #
-   # === Parameters
-   #
-   # * _url_ = The URL to fetch
-   # * _params_ = The parameters to send with a POST request
-   # * _agent = A manually supplied useragent string
-   #
-   # === Example
-   # ScraperWiki::scrape('http://scraperwiki.com')
-   #
-   def ScraperWiki.scrape(url, params = nil, agent = nil)
-     if agent
-       client = HTTPClient.new(:agent_name => agent)
-     else
-       client = HTTPClient.new
-     end
-     client.ssl_config.verify_mode = OpenSSL::SSL::VERIFY_NONE
-     if HTTPClient.respond_to?("client.transparent_gzip_decompression=")
-       client.transparent_gzip_decompression = true
-     end
-
-     if params.nil?
-       html = client.get_content(url)
-     else
-       html = client.post_content(url, params)
-     end
-
-     unless HTTPClient.respond_to?("client.transparent_gzip_decompression=")
-       begin
-         gz = Zlib::GzipReader.new(StringIO.new(html))
-         return gz.read
-       rescue
-         return html
-       end
-     end
+   extend self
+
+   # The scrape method fetches the content from a webserver.
+   #
+   # === Parameters
+   #
+   # * _url_ = The URL to fetch
+   # * _params_ = The parameters to send with a POST request
+   # * _agent = A manually supplied useragent string
+   # NB This method hasn't been refactored or tested, but could
+   # prob do with both
+   #
+   # === Example
+   # ScraperWiki::scrape('http://scraperwiki.com')
+   #
+   def scrape(url, params = nil, agent = nil)
+     if agent
+       client = HTTPClient.new(:agent_name => agent)
+     else
+       client = HTTPClient.new
      end
-
-   # Saves the provided data into a local database for this scraper. Data is upserted
-   # into this table (inserted if it does not exist, updated if the unique keys say it
-   # does).
-   #
-   # === Parameters
-   #
-   # * _unique_keys_ = A list of column names, that used together should be unique
-   # * _data_ = A hash of the data where the Key is the column name, the Value the row
-   #   value. If sending lots of data this can be a list of hashes.
-   # * _table_name_ = The name that the newly created table should use.
-   #
-   # === Example
-   # ScraperWiki::save(['id'], {'id'=>1})
-   #
-   def ScraperWiki.save_sqlite(unique_keys, data, table_name="swdata", verbose=0)
-     raise 'unique_keys must be nil or an array' if unique_keys != nil && !unique_keys.kind_of?(Array)
-     raise 'data must have a non-nil value' if data == nil
-
-     # convert :symbols to "strings"
-     unique_keys = unique_keys.map { |x| x.kind_of?(Symbol) ? x.to_s : x }
-
-     if data.class == Hash
-       data = [ data ]
-     elsif data.length == 0
-       return
-     end
-
-     rjdata = [ ]
-     for ldata in data
-       ljdata = _convdata(unique_keys, ldata)
-       rjdata.push(ljdata)
-
-     end
-
-     SQLiteMagic._do_save_sqlite(unique_keys, rjdata, table_name)
-   end
-
-   def ScraperWiki.sqliteexecute(query, data=nil, verbose=2)
-     SQLiteMagic.sqliteexecute(query, data, verbose)
+     client.ssl_config.verify_mode = OpenSSL::SSL::VERIFY_NONE
+     if HTTPClient.respond_to?("client.transparent_gzip_decompression=")
+       client.transparent_gzip_decompression = true
      end

-   def ScraperWiki.close_sqlite()
-     SQLiteMagic.close
+     if params.nil?
+       html = client.get_content(url)
+     else
+       html = client.post_content(url, params)
      end

-   # Internal function to check a row of data, convert to right format
-   def ScraperWiki._convdata(unique_keys, scraper_data)
-     if unique_keys
-       for key in unique_keys
-         if !key.kind_of?(String) and !key.kind_of?(Symbol)
-           raise 'unique_keys must each be a string or a symbol, this one is not: ' + key
-         end
-         if !scraper_data.include?(key) and !scraper_data.include?(key.to_sym)
-           raise 'unique_keys must be a subset of data, this one is not: ' + key
-         end
-         if scraper_data[key] == nil and scraper_data[key.to_sym] == nil
-           raise 'unique_key value should not be nil, this one is nil: ' + key
-         end
-       end
-     end
-
-     jdata = { }
-     scraper_data.each_pair do |key, value|
-       raise 'key must not have blank name' if not key
-
-       key = key.to_s if key.kind_of?(Symbol)
-       raise 'key must be string or symbol type: ' + key if key.class != String
-       raise 'key must be simple text: ' + key if !/[a-zA-Z0-9_\- ]+$/.match(key)
-
-       # convert formats
-       if value.kind_of?(Date)
-         value = value.iso8601
-       end
-       if value.kind_of?(Time)
-         value = value.iso8601
-         raise "internal error, timezone came out as non-UTC while converting to SQLite format" unless value.match(/([+-]00:00|Z)$/)
-         value.gsub!(/([+-]00:00|Z)$/, '')
-       end
-       if ![Fixnum, Float, String, TrueClass, FalseClass, NilClass].include?(value.class)
-         value = value.to_s
-       end
-
-       jdata[key] = value
-     end
-     return jdata
+     unless HTTPClient.respond_to?("client.transparent_gzip_decompression=")
+       begin
+         gz = Zlib::GzipReader.new(StringIO.new(html))
+         return gz.read
+       rescue
+         return html
+       end
      end
-
-   # Allows the user to retrieve a previously saved variable
-   #
-   # === Parameters
-   #
-   # * _name_ = The variable name to fetch
-   # * _default_ = The value to use if the variable name is not found
-   # * _verbose_ = Verbosity level
-   #
-   # === Example
-   # ScraperWiki::get_var('current', 0)
-   #
-   def ScraperWiki.get_var(name, default=nil, verbose=2)
-     begin
-       result = ScraperWiki.sqliteexecute("select value_blob, type from swvariables where name=?", [name], verbose)
-     rescue NoSuchTableSqliteException => e
-       return default
-     end
-
-     if !result.has_key?("data")
-       return default
-     end
-
-     if result["data"].length == 0
-       return default
-     end
-     # consider casting to type
-     svalue = result["data"][0][0]
-     vtype = result["data"][0][1]
-     if vtype == "Fixnum"
-       return svalue.to_i
-     end
-     if vtype == "Float"
-       return svalue.to_f
-     end
-     if vtype == "NilClass"
-       return nil
-     end
-     return svalue
+   end
+
+   def convert_data(value_data)
+     return value_data if value_data.nil? or (value_data.respond_to?(:empty?) and value_data.empty?)
+     [value_data].flatten(1).collect do |datum_hash|
+       datum_hash.inject({}) do |hsh, (k,v)|
+         hsh[k] =
+           case v
+           when Date, DateTime
+             v.iso8601
+           when Time
+             # maintains existing ScraperWiki behaviour
+             v.iso8601.sub(/([+-]00:00|Z)$/, '')
+           else
+             v
+           end
+         hsh
+       end
      end
-
-   # Allows the user to save a single variable (at a time) to carry state across runs of
-   # the scraper.
-   #
-   # === Parameters
-   #
-   # * _name_ = The variable name
-   # * _value_ = The value of the variable
-   # * _verbose_ = Verbosity level
-   #
-   # === Example
-   # ScraperWiki::save_var('current', 100)
-   #
-   def ScraperWiki.save_var(name, value, verbose=2)
-     vtype = String(value.class)
-     svalue = value.to_s
-     if vtype != "Fixnum" and vtype != "String" and vtype != "Float" and vtype != "NilClass"
-       puts "*** object of type "+vtype+" converted to string\n"
-     end
-     data = { "name" => name, "value_blob" => svalue, "type" => vtype }
-     ScraperWiki.save_sqlite(unique_keys=["name"], data=data, table_name="swvariables", verbose=verbose)
+   end
+
+   def config=(config_hash)
+     @config ||= config_hash
+   end
+   # Saves the provided data into a local database for this scraper. Data is upserted
+   # into this table (inserted if it does not exist, updated if the unique keys say it
+   # does).
+   #
+   # === Parameters
+   #
+   # * _unique_keys_ = A list of column names, that used together should be unique
+   # * _data_ = A hash of the data where the Key is the column name, the Value the row
+   #   value. If sending lots of data this can be an array of hashes.
+   # * _table_name_ = The name that the newly created table should use (default is 'swdata').
+   # * _verbose_ = A verbosity level (not currently implemented, and there just to avoid breaking existing code)
+   #
+   # === Example
+   # ScraperWiki::save(['id'], {'id'=>1})
+   #
+   def save_sqlite(unique_keys, data, table_name="swdata", _verbose=0)
+     converted_data = convert_data(data)
+     sqlite_magic_connection.save_data(unique_keys, converted_data, table_name)
+   end
+
+   # legacy alias for #save_sqlite method, so works with older scrapers
+   def save(*args)
+     save_sqlite(*args)
+   end
+
+   def sqliteexecute(query, data=nil, verbose=2)
+     sqlite_magic_connection.execute(query, data)
+   end
+
+   def close_sqlite
+     sqlite_magic_connection.close
+     @sqlite_magic_connection = nil
+   end
+
+   # Allows the user to retrieve a previously saved variable
+   #
+   # === Parameters
+   #
+   # * _name_ = The variable name to fetch
+   # * _default_ = The value to use if the variable name is not found
+   # * _verbose_ = A verbosity level (not currently implemented, and there just to avoid breaking existing code)
+   #
+   # === Example
+   # ScraperWiki.get_var('current', 0)
+   #
+   def get_var(name, default=nil, _verbose=2)
+     result = sqlite_magic_connection.execute("select value_blob, type from swvariables where name=?", [name])
+     return default if result.empty?
+     result_val = result.first['value_blob']
+     case result.first['type']
+     when 'Fixnum'
+       result_val.to_i
+     when 'Float'
+       result_val.to_f
+     when 'NilClass'
+       nil
+     else
+       result_val
      end
-
-   def ScraperWiki.raisesqliteerror(rerror)
-     if /sqlite3.Error: no such table:/.match(rerror) # old dataproxy
-       raise NoSuchTableSqliteException.new(rerror)
-     end
-     if /DB Error: \(OperationalError\) no such table:/.match(rerror)
-       raise NoSuchTableSqliteException.new(rerror)
-     end
-     raise SqliteException.new(rerror)
+   rescue SqliteMagic::NoSuchTable
+     return default
+   end
+
+   # Allows the user to save a single variable (at a time) to carry state across runs of
+   # the scraper.
+   #
+   # === Parameters
+   #
+   # * _name_ = The variable name
+   # * _value_ = The value of the variable
+   # * _verbose_ = A verbosity level (not currently implemented, and there just to avoid breaking existing code)
+   #
+   # === Example
+   # ScraperWiki.save_var('current', 100)
+   #
+   def save_var(name, value, _verbose=2)
+     val_type = value.class.to_s
+     unless ['Fixnum','String','Float','NilClass'].include?(val_type)
+       puts "*** object of type #{val_type} converted to string\n"
      end

-   # Allows for a simplified select statement
-   #
-   # === Parameters
-   #
-   # * _sqlquery_ = A valid select statement, without the select keyword
-   # * _data_ = Any data provided for ? replacements in the query
-   # * _verbose_ = A verbosity level
-   #
-   # === Returns
-   # A list of hashes containing the returned data
-   #
-   # === Example
-   # ScraperWiki::select('* from swdata')
-   #
-   def ScraperWiki.select(sqlquery, data=nil, verbose=1)
-     if data != nil && sqlquery.scan(/\?/).length != 0 && data.class != Array
-       data = [data]
-     end
-     result = ScraperWiki.sqliteexecute("select "+sqlquery, data, verbose)
-     res = [ ]
-     for d in result["data"]
-       #res.push(Hash[result["keys"].zip(d)]) # post-1.8.7
-       res.push(Hash[*result["keys"].zip(d).flatten]) # pre-1.8.7
-     end
-     return res
-   end
+     data = { :name => name.to_s, :value_blob => value.to_s, :type => val_type }
+     sqlite_magic_connection.save_data([:name], data, 'swvariables')
+   end
+
+   # Allows for a simplified select statement
+   #
+   # === Parameters
+   #
+   # * _sqlquery_ = A valid select statement, without the select keyword
+   # * _data_ = Bind variables provided for ? replacements in the query. See Sqlite3#execute for details
+   # * _verbose_ = A verbosity level (not currently implemented, and there just to avoid breaking existing code)
+   #
+   # === Returns
+   # An array of hashes containing the returned data
+   #
+   # === Example
+   # ScraperWiki.select('* from swdata')
+   #
+   def select(sqlquery, data=nil, _verbose=1)
+     sqlite_magic_connection.execute("SELECT "+sqlquery, data)
+   end
+
+   # Establish an SQLiteMagic::Connection (and remember it)
+   def sqlite_magic_connection
+     db = @config ? @config[:db] : 'sqlite.db'
+     @sqlite_magic_connection ||= SqliteMagic::Connection.new(db)
+   end
+
  end
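
All persistence in the rewrite above goes through a lazily created, memoised SqliteMagic::Connection. A short sketch of how the pieces combine (the database path here is an illustrative assumption, not part of the gem):

    require 'scraperwiki'

    # Optional: point the library at a specific SQLite file before first use;
    # config= is only honoured once (@config ||= ...) and the default is 'sqlite.db'
    ScraperWiki.config = { :db => 'my_scraper.db' }

    # convert_data normalises Date/Time values before the upsert reaches
    # SqliteMagic::Connection#save_data
    ScraperWiki.save_sqlite([:id], { :id => 1, :seen_at => Time.now.utc })

    # Drop the memoised connection when finished
    ScraperWiki.close_sqlite
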
data/lib/scraperwiki/version.rb ADDED
@@ -0,0 +1,3 @@
+ module ScraperWiki
+   VERSION = '3.0.0'
+ end
data/scraperwiki.gemspec ADDED
@@ -0,0 +1,27 @@
+ # -*- encoding: utf-8 -*-
+ lib = File.expand_path('../lib', __FILE__)
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+ require 'scraperwiki/version'
+ # require File.expand_path('../lib/scraperwiki/version', __FILE__)
+
+ Gem::Specification.new do |gem|
+   gem.authors = ['Francis Irving']
+   gem.email = 'francis@scraperwiki.com'
+   gem.description = 'A library for scraping web pages and saving data easily'
+   gem.summary = 'ScraperWiki'
+   gem.homepage = 'http://rubygems.org/gems/scraperwiki'
+
+   gem.files = `git ls-files`.split($\)
+   gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
+   gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
+   gem.name = 'scraperwiki'
+   gem.require_paths = ['lib']
+   gem.version = ScraperWiki::VERSION
+
+   gem.add_dependency "httpclient"
+   gem.add_dependency "sqlite_magic"
+   # gem.add_dependency "sqlite3"
+   gem.add_development_dependency "rake"
+   gem.add_development_dependency "rspec"
+   gem.add_development_dependency "debugger"
+ end
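
The Gemfile added earlier in this diff simply delegates to this gemspec via the `gemspec` directive. A consumer wanting this exact release could instead pin it directly (a sketch, assuming the gem is published to rubygems.org under the name set above):

    # Gemfile
    source 'https://rubygems.org'
    gem 'scraperwiki', '3.0.0'
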
data/spec/scraperwiki_spec.rb ADDED
@@ -0,0 +1,285 @@
+ require 'scraperwiki'
+ require 'spec_helper'
+ require 'debugger'
+
+ describe ScraperWiki do
+   before do
+     @dummy_sqlite_magic_connection = double('sqlite_magic_connection')
+     SqliteMagic::Connection.stub(:new).and_return(@dummy_sqlite_magic_connection)
+   end
+
+   after do
+     # reset cached value
+     ScraperWiki.instance_variable_set(:@sqlite_magic_connection, nil)
+     ScraperWiki.instance_variable_set(:@config, nil)
+   end
+
+   describe "#config=" do
+     it "should set config instance variable" do
+       ScraperWiki.config = :some_config
+       ScraperWiki.instance_variable_get(:@config).should == :some_config
+     end
+
+   end
+
+   describe 'sqlite_magic_connection' do
+     it 'should execute select query and bind variables to connection' do
+       sql_snippet = 'foo from bar WHERE "baz"=42'
+       @dummy_sqlite_magic_connection.should_receive(:execute).with("SELECT #{sql_snippet}", ['foo', 'bar'])
+       ScraperWiki.select(sql_snippet, ['foo', 'bar'])
+     end
+
+     context 'and no config set' do
+       it 'should get an SqliteMagic::Connection with default db name and no path' do
+         SqliteMagic::Connection.should_receive(:new).with('sqlite.db').and_return(@dummy_sqlite_magic_connection)
+         ScraperWiki.sqlite_magic_connection
+       end
+     end
+
+     context 'and config set' do
+       before do
+         ScraperWiki.config = {:db => '/some/location/of/sqlite_file.db'}
+       end
+
+       it 'should get an SqliteMagic::Connection with db set in config' do
+         SqliteMagic::Connection.should_receive(:new).with('/some/location/of/sqlite_file.db').and_return(@dummy_sqlite_magic_connection)
+         ScraperWiki.sqlite_magic_connection
+       end
+     end
+
+     it 'should cache connection' do
+       SqliteMagic::Connection.should_receive(:new).and_return(@dummy_sqlite_magic_connection) # just once
+       ScraperWiki.sqlite_magic_connection
+       ScraperWiki.sqlite_magic_connection
+     end
+
+   end
+
+   describe '#select' do
+     it 'should execute select query with bind variables on connection' do
+       sql_snippet = 'foo from bar WHERE "baz"=42'
+       @dummy_sqlite_magic_connection.should_receive(:execute).with("SELECT #{sql_snippet}", ['foo', 'bar'])
+       ScraperWiki.select(sql_snippet, ['foo', 'bar'])
+     end
+
+     it "should return array of hashes returned by connection" do
+       sqlite_magic_response = [{"animal"=>"fox"}, {"animal"=>"cat"}]
+       @dummy_sqlite_magic_connection.stub(:execute).and_return(sqlite_magic_response)
+       ScraperWiki.select('foo', ['foo', 'bar']).should == sqlite_magic_response
+     end
+
+     context 'and no second argument passed' do
+       it 'should pass nil to connection as second argument' do
+         sql_snippet = 'foo from bar WHERE "baz"=42'
+         @dummy_sqlite_magic_connection.should_receive(:execute).with("SELECT #{sql_snippet}", nil)
+         ScraperWiki.select(sql_snippet)
+       end
+     end
+   end
+
+   describe '#save_sqlite' do
+     before do
+       # don't do anything with raw_data by default
+       ScraperWiki.stub(:convert_data) { |raw_data| raw_data }
+     end
+
+     it 'should save data using given unique keys' do
+       @dummy_sqlite_magic_connection.should_receive(:save_data).with(:unique_keys, anything, anything)
+       ScraperWiki.save_sqlite(:unique_keys, :some_data)
+     end
+
+     it 'should convert data before saving' do
+       ScraperWiki.should_receive(:convert_data).with(:some_data).and_return(:converted_data)
+       @dummy_sqlite_magic_connection.should_receive(:save_data).with(anything, :converted_data, anything)
+       ScraperWiki.save_sqlite(:unique_keys, :some_data)
+     end
+
+     it 'should save data in swdata by default' do
+       @dummy_sqlite_magic_connection.should_receive(:save_data).with(anything, anything, 'swdata')
+       ScraperWiki.save_sqlite(:unique_keys, :some_data)
+     end
+
+     it 'should save data in given table' do
+       @dummy_sqlite_magic_connection.should_receive(:save_data).with(anything, anything, 'another_table')
+       ScraperWiki.save_sqlite(:unique_keys, :some_data, 'another_table')
+     end
+
+     it 'should return response from connection' do
+       @dummy_sqlite_magic_connection.stub(:save_data).and_return(:save_response)
+       ScraperWiki.save_sqlite(:unique_keys, :some_data).should == :save_response
+     end
+
+     it 'should return response from connection' do
+       @dummy_sqlite_magic_connection.stub(:save_data).and_return(:save_response)
+       ScraperWiki.save_sqlite(:unique_keys, :some_data).should == :save_response
+     end
+   end
+
+   describe '#execute' do
+     it 'should execute query on sqlite_magic_connection' do
+       @dummy_sqlite_magic_connection.should_receive(:execute).with(:some_query, :some_data)
+       ScraperWiki.sqliteexecute(:some_query, :some_data)
+     end
+
+     it 'should return result of execute query' do
+       @dummy_sqlite_magic_connection.stub(:execute).and_return(:query_result)
+       ScraperWiki.sqliteexecute(:some_query, :some_data).should == :query_result
+     end
+   end
+
+   describe '#save_var' do
+     it 'should save data using :name as unique key' do
+       @dummy_sqlite_magic_connection.should_receive(:save_data).with([:name], anything, anything)
+       ScraperWiki.save_var(:foo, 'bar')
+     end
+
+     it 'should save data as string with data class as :type' do
+       @dummy_sqlite_magic_connection.should_receive(:save_data).with(anything, {:name => 'foo', :value_blob => 'bar', :type => 'String'}, anything)
+       @dummy_sqlite_magic_connection.should_receive(:save_data).with(anything, {:name => 'meaning_of_life', :value_blob => '42', :type => 'Fixnum'}, anything)
+       ScraperWiki.save_var(:foo, 'bar')
+       ScraperWiki.save_var(:meaning_of_life, 42)
+     end
+
+     it 'should save data in "swvariables"' do
+       @dummy_sqlite_magic_connection.should_receive(:save_data).with(anything, anything, "swvariables")
+       ScraperWiki.save_var(:foo, 'bar')
+     end
+   end
+
+   describe '#get_var' do
+     it 'should select data using given key' do
+       @dummy_sqlite_magic_connection.should_receive(:execute).
+         with("select value_blob, type from swvariables where name=?", [:foo]).
+         and_return([{'value_blob' => 'bar', 'type' => 'String'}])
+       ScraperWiki.get_var(:foo)
+     end
+
+     it 'should return data returned by sqlite_magic_connection' do
+       @dummy_sqlite_magic_connection.stub(:execute).
+         and_return([{'value_blob' => 'bar', 'type' => 'String'}])
+       ScraperWiki.get_var(:foo).should == 'bar'
+     end
+
+     it 'should cast Fixnum data to integer' do
+       @dummy_sqlite_magic_connection.stub(:execute).
+         and_return([{'value_blob' => '42', 'type' => 'Fixnum'}])
+       ScraperWiki.get_var(:foo).should == 42
+     end
+
+     it 'should cast Float data to float' do
+       @dummy_sqlite_magic_connection.stub(:execute).
+         and_return([{'value_blob' => '0.234', 'type' => 'Float'}])
+       ScraperWiki.get_var(:foo).should == '0.234'.to_f
+     end
+
+     it 'should cast Nil data to nil' do
+       @dummy_sqlite_magic_connection.stub(:execute).
+         and_return([{'value_blob' => 'nil', 'type' => 'NilClass'}])
+       ScraperWiki.get_var(:foo).should be_nil
+     end
+
+     context 'and connection returns empty array' do
+       before do
+         @dummy_sqlite_magic_connection.stub(:execute).
+           and_return([])
+       end
+
+       it "should return nil" do
+         ScraperWiki.get_var(:foo).should be_nil
+       end
+
+       it "should return default if default given" do
+         ScraperWiki.get_var(:foo, 'bar').should == 'bar'
+       end
+     end
+
+     context 'and SqliteMagic::NoSuchTable raised' do
+       before do
+         @dummy_sqlite_magic_connection.stub(:execute).
+           and_raise(SqliteMagic::NoSuchTable)
+       end
+
+       it "should return nil" do
+         ScraperWiki.get_var(:foo).should be_nil
+       end
+
+       it "should return default if default given" do
+         ScraperWiki.get_var(:foo, 'bar').should == 'bar'
+       end
+     end
+
+     context 'and other error raised' do
+       before do
+         @dummy_sqlite_magic_connection.stub(:execute).
+           and_raise
+       end
+
+       it "should raise error" do
+         lambda { ScraperWiki.get_var(:foo) }.should raise_error
+       end
+     end
+   end
+
+   describe '#close_sqlite' do
+     it 'should close sqlite_magic_connection' do
+       @dummy_sqlite_magic_connection.should_receive(:close)
+       ScraperWiki.close_sqlite
+     end
+
+     it 'should lose cached connection' do
+       @dummy_sqlite_magic_connection.stub(:close)
+       ScraperWiki.close_sqlite
+       ScraperWiki.instance_variable_get(:@sqlite_magic_connection).should be_nil
+     end
+   end
+
+   describe "#save" do
+     it "should delegate to #save_sqlite" do
+       ScraperWiki.should_receive(:save_sqlite).with(:foo, :bar).and_return(:result)
+       ScraperWiki.save(:foo, :bar).should == :result
+     end
+   end
+
+   describe "#convert_data" do
+     it "should return nil if passed nil" do
+       ScraperWiki.convert_data(nil).should == nil
+     end
+
+     it "should return empty array if passed empty array" do
+       ScraperWiki.convert_data([]).should == []
+     end
+
+     context 'and passed a hash' do
+       it "should return array containing hash if passed hash" do
+         ScraperWiki.convert_data({:foo => 'bar'}).should == [{:foo => 'bar'}]
+       end
+
+       it "should convert date, time and datetime to iso8601" do
+         date = Date.today
+         time = Time.now
+         datetime = (Time.now - 100).to_datetime
+         values_hash = {:foo => 'bar', :date_val => date, :time_val => time, :datetime_val => datetime}
+         expected_result = [{:foo => 'bar', :date_val => date.iso8601, :time_val => time.utc.iso8601.sub(/([+-]00:00|Z)$/, ''), :datetime_val => datetime.to_s}]
+         ScraperWiki.convert_data(values_hash).should == expected_result
+       end
+     end
+
+     context 'and passed an array of hashes' do
+       it "should return array containing hash if passed hash" do
+         ScraperWiki.convert_data([{:foo => 'bar'}, {:bar => 'baz'}]).should == [{:foo => 'bar'}, {:bar => 'baz'}]
+       end
+
+       it "should convert date, time and datetime to iso8601" do
+         date = Date.today
+         time = Time.now
+         datetime = (Time.now - 100).to_datetime
+         values_array = [{:foo => 'bar', :date_val => date}, {:time_val => time, :datetime_val => datetime}]
+         expected_result = [{:foo => 'bar', :date_val => date.iso8601}, {:time_val => time.utc.iso8601.sub(/([+-]00:00|Z)$/,''), :datetime_val => datetime.to_s}]
+         ScraperWiki.convert_data(values_array).should == expected_result
+       end
+     end
+   end
+
+ end
+
+
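
The convert_data expectations near the end of the spec pin down the value coercion shown in lib/scraperwiki.rb above. A concrete sketch of what they assert (dates become ISO 8601 strings, everything else passes through unchanged):

    require 'scraperwiki'
    require 'date'

    ScraperWiki.convert_data({ :date_val => Date.new(2013, 7, 17), :n => 42 })
    # => [{ :date_val => "2013-07-17", :n => 42 }]
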
data/spec/spec_helper.rb ADDED
@@ -0,0 +1,5 @@
+ require 'rspec/autorun'
+ # require 'debugger'
+
+ RSpec.configure do |config|
+ end
metadata CHANGED
@@ -1,15 +1,15 @@
  --- !ruby/object:Gem::Specification
  name: scraperwiki
  version: !ruby/object:Gem::Version
-   version: 2.0.6
+   version: 3.0.0
  prerelease:
  platform: ruby
  authors:
- - Francis irving
+ - Francis Irving
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2013-04-04 00:00:00.000000000 Z
+ date: 2013-07-17 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: httpclient
@@ -28,7 +28,7 @@ dependencies:
      - !ruby/object:Gem::Version
        version: '0'
  - !ruby/object:Gem::Dependency
-   name: sqlite3
+   name: sqlite_magic
    requirement: !ruby/object:Gem::Requirement
      none: false
      requirements:
@@ -43,14 +43,70 @@ dependencies:
      - - ! '>='
        - !ruby/object:Gem::Version
          version: '0'
+ - !ruby/object:Gem::Dependency
+   name: rake
+   requirement: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: rspec
+   requirement: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: debugger
+   requirement: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
  description: A library for scraping web pages and saving data easily
  email: francis@scraperwiki.com
  executables: []
  extensions: []
  extra_rdoc_files: []
  files:
+ - .gitignore
+ - Gemfile
+ - LICENCE
+ - README.md
+ - Rakefile
  - lib/scraperwiki.rb
- - lib/scraperwiki/sqlite_save_info.rb
+ - lib/scraperwiki/version.rb
+ - scraperwiki.gemspec
+ - spec/scraperwiki_spec.rb
+ - spec/spec_helper.rb
  homepage: http://rubygems.org/gems/scraperwiki
  licenses: []
  post_install_message:
@@ -75,4 +131,6 @@ rubygems_version: 1.8.23
  signing_key:
  specification_version: 3
  summary: ScraperWiki
- test_files: []
+ test_files:
+ - spec/scraperwiki_spec.rb
+ - spec/spec_helper.rb
data/lib/scraperwiki/sqlite_save_info.rb DELETED
@@ -1,220 +0,0 @@
- # Builds schemas automatically from a hash, for SQLite databases
- #
- # Ported from ScraperWiki Classic - scraperwiki/services/datastore/datalib.py
- # This will make the code quite unRubyish - it is Julian Todd's Python, ported.
-
-
- # TODO:
- # Sort out 'error' bits
-
- require 'set'
- require 'sqlite3'
-
- module SQLiteMagic
-   @db = nil
-   @sqlitesaveinfo = {}
-
-   def SQLiteMagic._open_db_if_necessary()
-     if @db.nil?
-       @db = SQLite3::Database.new("scraperwiki.sqlite")
-     end
-   end
-
-   def SQLiteMagic._do_save_sqlite(unique_keys, data, swdatatblname)
-     SQLiteMagic._open_db_if_necessary
-
-     res = { }
-     if data.class == Hash
-       data = [data]
-     end
-
-     if !@sqlitesaveinfo.include?(swdatatblname)
-       ssinfo = SqliteSaveInfo.new(swdatatblname, @db)
-       @sqlitesaveinfo[swdatatblname] = ssinfo
-       if not ssinfo.rebuildinfo() and data.length > 0
-         ssinfo.buildinitialtable(data[0])
-         ssinfo.rebuildinfo()
-         res["tablecreated"] = swdatatblname
-       end
-     else
-       ssinfo = @sqlitesaveinfo[swdatatblname]
-     end
-
-     @db.transaction()
-
-     nrecords = 0
-     data.each do |ldata|
-       newcols = ssinfo.newcolumns(ldata)
-       if newcols.length > 0
-         newcols.each_with_index do |kv, i|
-           ssinfo.addnewcolumn(kv[0], kv[1])
-           res["newcolumn %d" % i] = "%s %s" % kv
-         end
-         ssinfo.rebuildinfo()
-       end
-
-       if nrecords == 0 && unique_keys.length > 0
-         idxname, idxkeys = ssinfo.findclosestindex(unique_keys)
-         # puts "findclosestindex returned name:"+ idxname.to_s + " keys:" + idxkeys.to_s
-         if !idxname || idxkeys != unique_keys.to_set
-           lres = ssinfo.makenewindex(idxname, unique_keys)
-           if lres.include?('error')
-             return lres
-           end
-           res.merge!(lres)
-         end
-       end
-
-       lres = ssinfo.insertdata(ldata)
-       nrecords += 1
-     end
-
-     @db.commit()
-     # log(nrecords + " inserted or replaced")
-     return res
-   end
-
-   def SQLiteMagic.sqliteexecute(query, data=nil, verbose=2)
-     SQLiteMagic._open_db_if_necessary
-     cols,*rows = (data.nil?)? @db.execute2(query) : @db.execute2(query,data)
-     return {"keys"=>cols, "data"=>rows} unless cols.nil? or rows.nil?
-   end
-
-   def SQLiteMagic.close()
-     @db.close
-     @db = nil
-     @sqlitesaveinfo = {}
-   end
-
-
-   class SqliteSaveInfo
-     def initialize(swdatatblname, db)
-       @swdatatblname = swdatatblname
-       @swdatakeys = [ ]
-       @swdatatypes = [ ]
-       @sqdatatemplate = ""
-       @db = db
-     end
-
-     def rebuildinfo()
-       does_exist = @db.get_first_value("select count(*) from main.sqlite_master where name=?", @swdatatblname)
-       if does_exist == 0
-         return false
-       end
-
-       tblinfo = @db.execute("PRAGMA main.table_info(`%s`)" % @swdatatblname)
-       # puts "tblinfo="+ tblinfo.to_s
-
-       @swdatakeys = tblinfo.map { |a| a[1] }
-       @swdatatypes = tblinfo.map { |a| a[2] }
-       @sqdatatemplate = format("insert or replace into main.`%s` values (%s)", @swdatatblname, (["?"]*@swdatakeys.length).join(","))
-       return true
-     end
-
-
-     def buildinitialtable(data)
-       raise "buildinitialtable: no swdatakeys" unless @swdatakeys.length == 0
-       coldef = self.newcolumns(data)
-       raise "buildinitialtable: no coldef" unless coldef.length > 0
-       # coldef = coldef[:1] # just put one column in; the rest could be altered -- to prove it's good
-       scoldef = coldef.map { |col| format("`%s` %s", col[0], col[1]) }.join(",")
-       @db.execute(format("create table main.`%s` (%s)", @swdatatblname, scoldef))
-     end
-
-     def newcolumns(data)
-       newcols = [ ]
-       for k, v in data
-         if !@swdatakeys.include?(k)
-           if v != nil
-             #if k[-5:] == "_blob"
-             # vt = "blob" # coerced into affinity none
-             if v.class == Fixnum
-               vt = "integer"
-             elsif v.class == Float
-               vt = "real"
-             else
-               vt = "text"
-             end
-             newcols.push([k, vt])
-           end
-         end
-       end
-       # puts "newcols=" + newcols.to_s
-       return newcols
-     end
-
-     def addnewcolumn(k, vt)
-       @db.execute(format("alter table main.`%s` add column `%s` %s", @swdatatblname, k, vt))
-     end
-
-     def findclosestindex(unique_keys)
-       idxlist = @db.execute(format("PRAGMA main.index_list(`%s`)", @swdatatblname)) # [seq,name,unique]
-       # puts "findclosestindex: idxlist is "+ idxlist.to_s
-       if idxlist.include?('error')
-         return [nil, nil]
-       end
-
-       uniqueindexes = [ ]
-       for idxel in idxlist
-         if idxel[2]
-           idxname = idxel[1]
-           idxinfo = @db.execute(format("PRAGMA main.index_info(`%s`)", idxname)) # [seqno,cid,name]
-           idxset = idxinfo.map { |a| a[2] }.to_set
-           idxoverlap = idxset.intersection(unique_keys).length
-           uniqueindexes.push([idxoverlap, idxname, idxset])
-         end
-       end
-
-       if uniqueindexes.length == 0
-         return [nil, nil]
-       end
-       uniqueindexes.sort()
-       # puts "uniqueindexes=" + uniqueindexes.to_s
-       return [uniqueindexes[-1][1], uniqueindexes[-1][2]]
-     end
-
-     # increment to next index number every time there is a change, and add the new index before dropping the old one.
-     def makenewindex(idxname, unique_keys)
-       istart = 0
-       if idxname
-         #mnum = re.search("(\d+)$", idxname)
-         #if mnum
-         # istart = int(mnum.group(1))
-         #end
-         istart = idxname.match("(\d+)$").first.to_i rescue 0
-       end
-       for i in 0..10000
-         newidxname = format("%s_index%d", @swdatatblname, istart+i)
-         does_exist = @db.get_first_value("select count(*) from main.sqlite_master where name=?", newidxname)
-         if does_exist == 0
-           break
-         end
-       end
-
-       res = { "newindex" => newidxname }
-       lres = @db.execute(format("create unique index `%s` on `%s` (%s)", newidxname, @swdatatblname, unique_keys.map { |k| format("`%s`", k) }.join(",")))
-       if lres.include?('error')
-         return lres
-       end
-       if idxname
-         lres = @db.execute(format("drop index main.`%s`", idxname))
-         if lres.include?('error')
-           if lres['error'] != 'sqlite3.Error: index associated with UNIQUE or PRIMARY KEY constraint cannot be dropped'
-             return lres
-           end
-         end
-         res["droppedindex"] = idxname
-       end
-       return res
-     end
-
-     def insertdata(data)
-       values = @swdatakeys.map { |k| data[k] }
-       res = @db.query(@sqdatatemplate, values)
-       res.close
-     end
-   end
-
- end
-
-
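
The deleted module above was the hand-rolled schema builder ported from ScraperWiki Classic; its responsibilities (table creation, column addition, unique indexes, upserts) now live in the sqlite_magic gem. A sketch of the equivalent direct calls, using only the connection methods exercised by the new lib/scraperwiki.rb (the file name is illustrative):

    require 'sqlite_magic'

    conn = SqliteMagic::Connection.new('scraperwiki.sqlite')

    # Creates the table and columns as needed and upserts on the unique key,
    # replacing what _do_save_sqlite used to orchestrate by hand
    conn.save_data([:id], [{ :id => 1, :name => 'foo' }], 'swdata')

    rows = conn.execute('SELECT name FROM swdata WHERE id = ?', [1])
    conn.close
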