scrapi 1.2.0 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +10 -0
- data/MIT-LICENSE +0 -0
- data/{README → README.rdoc} +16 -5
- data/Rakefile +20 -35
- data/lib/html/htmlparser.rb +0 -0
- data/lib/html/selector.rb +0 -0
- data/lib/scraper/base.rb +5 -5
- data/lib/scraper/reader.rb +12 -11
- data/lib/tidy/libtidy.dll +0 -0
- data/lib/tidy/libtidy.so +0 -0
- data/test/mock_net_http.rb +0 -0
- data/test/node_ext_test.rb +1 -1
- data/test/reader_test.rb +27 -8
- data/test/scraper_test.rb +12 -7
- data/test/selector_test.rb +1 -1
- metadata +84 -59
data/CHANGELOG
CHANGED
@@ -1,3 +1,13 @@
|
|
1
|
+
Version 2.0.0 (November 10, 2010)
|
2
|
+
|
3
|
+
* Ruby 1.9.2 support using Tidy FFI, by Christoph Lupprich.
|
4
|
+
|
5
|
+
Version 1.2.1 (Upcoming)
|
6
|
+
|
7
|
+
* Added: Cheat sheets.
|
8
|
+
* Fixed: Support for redirects that return path-only locations.
|
9
|
+
Credit: Rick Wargo (http://www.rickwargo.com)
|
10
|
+
|
1
11
|
Version 1.2.0 (August 27, 2006)
|
2
12
|
|
3
13
|
* Added: collect() method called just before result().
|
data/MIT-LICENSE
CHANGED
File without changes
|
data/{README → README.rdoc}
RENAMED
@@ -40,13 +40,22 @@ To get the latest source code with regular updates:
|
|
40
40
|
|
41
41
|
svn co http://labnotes.org/svn/public/ruby/scrapi
|
42
42
|
|
43
|
+
== Version of Ruby
|
44
|
+
|
45
|
+
ScrAPI 1.2.x was tested with Ruby 1.8.6 and 1.8.7, but will not work on Ruby 1.9.x.
|
46
|
+
|
47
|
+
ScrAPI 2.0.x switches to TidyFFI to run on Ruby 1.9.2 and newer.
|
48
|
+
|
49
|
+
Due to a bug in Ruby's visibility context handling (see changelog #29578 and bug
|
50
|
+
#3406 on the official Ruby page), you need to declare all result attributes
|
51
|
+
explicitly, using the result method or attr_reader/attr_accessor.
|
43
52
|
|
44
53
|
== Using TIDY
|
45
54
|
|
46
|
-
By default scrAPI uses Tidy to cleanup the HTML.
|
55
|
+
By default scrAPI uses Tidy (actually Tidy-FFI) to clean up the HTML.
|
47
56
|
|
48
57
|
You need to install the Tidy Gem for Ruby:
|
49
|
-
gem install
|
58
|
+
gem install tidy_ffi
|
50
59
|
|
51
60
|
And the Tidy binary libraries, available here:
|
52
61
|
|
@@ -56,15 +65,15 @@ By default scrAPI looks for the Tidy DLL (Windows) or shared library (Linux) in
|
|
56
65
|
|
57
66
|
Alternatively, just point Tidy to the library with:
|
58
67
|
|
59
|
-
|
68
|
+
TidyFFI.library_path = "...."
|
60
69
|
|
61
70
|
On Linux this would probably be:
|
62
71
|
|
63
|
-
|
72
|
+
TidyFFI.library_path = "/usr/local/lib/libtidy.so"
|
64
73
|
|
65
74
|
On OS/X this would probably be:
|
66
75
|
|
67
|
-
|
76
|
+
TidyFFI.library_path = "/usr/lib/libtidy.dylib"
|
68
77
|
|
69
78
|
For testing purposes, you can also use the built-in HTML parser. It's useful for testing and getting to grips with scrAPI, but it doesn't deal well with broken HTML. So for testing only:
|
70
79
|
|
@@ -86,3 +95,5 @@ HTML DOM extracted from Rails, Copyright (c) 2004 David Heinemeier Hansson. Unde
|
|
86
95
|
|
87
96
|
HTML parser by Takahiro Maebashi and Katsuyuki Komatsu, Ruby license.
|
88
97
|
http://www.jin.gr.jp/~nahi/Ruby/html-parser/README.html
|
98
|
+
|
99
|
+
Porting to Ruby 1.9.x by Christoph Lupprich, http://lupprich.info
|
data/Rakefile
CHANGED
@@ -1,12 +1,11 @@
|
|
1
1
|
require "benchmark"
|
2
2
|
require "rubygems"
|
3
|
-
Gem::manage_gems
|
4
3
|
require "rake"
|
5
4
|
require "rake/testtask"
|
6
5
|
require "rake/rdoctask"
|
7
|
-
require "rake/gempackagetask"
|
8
6
|
|
9
7
|
|
8
|
+
spec = Gem::Specification.load(File.join(File.dirname(__FILE__), 'scrapi.gemspec'))
|
10
9
|
|
11
10
|
desc "Generate documentation"
|
12
11
|
Rake::RDocTask.new(:rdoc) do |rdoc|
|
@@ -14,7 +13,7 @@ Rake::RDocTask.new(:rdoc) do |rdoc|
|
|
14
13
|
rdoc.title = "Scraper"
|
15
14
|
rdoc.options << "--line-numbers"
|
16
15
|
rdoc.options << "--inline-source"
|
17
|
-
rdoc.rdoc_files.include("README")
|
16
|
+
rdoc.rdoc_files.include("README.rdoc")
|
18
17
|
rdoc.rdoc_files.include("lib/**/*.rb")
|
19
18
|
end
|
20
19
|
|
@@ -25,42 +24,28 @@ Rake::TestTask.new(:test) do |test|
|
|
25
24
|
test.pattern = "test/**/*_test.rb"
|
26
25
|
test.verbose = true
|
27
26
|
end
|
27
|
+
task :default=>:test
|
28
28
|
|
29
29
|
|
30
|
-
|
31
|
-
gem_spec = Gem::Specification.new do |spec|
|
30
|
+
spec = Gem::Specification.load(Dir["*.gemspec"].first)
|
32
31
|
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
break
|
38
|
-
end
|
39
|
-
end
|
40
|
-
raise RuntimeError, "Can't find version number in changelog" unless version
|
41
|
-
|
42
|
-
spec.name = "scrapi"
|
43
|
-
spec.version = version
|
44
|
-
spec.summary = "scrAPI toolkit for Ruby. Uses CSS selectors to write easy, maintainable HTML scraping rules."
|
45
|
-
spec.description = <<-EOF
|
46
|
-
scrAPI is an HTML scraping toolkit for Ruby. It uses CSS selectors to write easy, maintainable scraping rules to select, extract and store data from HTML content.
|
47
|
-
EOF
|
48
|
-
spec.author = "Assaf Arkin"
|
49
|
-
spec.email = "assaf.arkin@gmail.com"
|
50
|
-
spec.homepage = "http://blog.labnotes.org/category/scrapi/"
|
32
|
+
desc "Build the Gem"
|
33
|
+
task :build do
|
34
|
+
sh "gem build #{spec.name}.gemspec"
|
35
|
+
end
|
51
36
|
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
spec.
|
56
|
-
spec.add_dependency "tidy", ">=1.1.0"
|
57
|
-
spec.has_rdoc = true
|
58
|
-
spec.rdoc_options << "--main" << "README" << "--title" << "scrAPI toolkit for Ruby" << "--line-numbers"
|
59
|
-
spec.extra_rdoc_files = ["README"]
|
60
|
-
spec.rubyforge_project = "scrapi"
|
37
|
+
desc "Install #{spec.name} locally"
|
38
|
+
task :install=>:build do
|
39
|
+
sudo = "sudo" unless File.writable?( Gem::ConfigMap[:bindir])
|
40
|
+
sh "#{sudo} gem install #{spec.name}-#{spec.version}.gem"
|
61
41
|
end
|
62
42
|
|
63
|
-
|
64
|
-
|
65
|
-
|
43
|
+
desc "Push new release to gemcutter and git tag"
|
44
|
+
task :push=>["test", "build"] do
|
45
|
+
sh "git push"
|
46
|
+
puts "Tagging version #{spec.version} .."
|
47
|
+
sh "git tag v#{spec.version}"
|
48
|
+
sh "git push --tag"
|
49
|
+
puts "Building and pushing gem .."
|
50
|
+
sh "gem push #{spec.name}-#{spec.version}.gem"
|
66
51
|
end
|
data/lib/html/htmlparser.rb
CHANGED
File without changes
|
data/lib/html/selector.rb
CHANGED
File without changes
|
data/lib/scraper/base.rb
CHANGED
@@ -327,7 +327,7 @@ module Scraper
|
|
327
327
|
# The following options are supported for parsing the HTML:
|
328
328
|
# * <tt>:root_element</tt> -- The root element to scrape, see
|
329
329
|
# also #root_elements.
|
330
|
-
# * <tt>:
|
330
|
+
# * <tt>:parser</tt> -- Specifies which parser to use.
|
331
331
|
# (Typically, you set this for the class).
|
332
332
|
# * <tt>:parser_options</tt> -- Options to pass to the parser.
|
333
333
|
#
|
@@ -906,10 +906,10 @@ module Scraper
|
|
906
906
|
# end
|
907
907
|
def skip(elements = nil)
|
908
908
|
case elements
|
909
|
-
when Array
|
910
|
-
when HTML::Node
|
911
|
-
when nil
|
912
|
-
when true, false
|
909
|
+
when Array then @skip.concat elements
|
910
|
+
when HTML::Node then @skip << elements
|
911
|
+
when nil then @skip << true
|
912
|
+
when true, false then @skip << elements
|
913
913
|
end
|
914
914
|
# Calling skip(element) as the last statement is
|
915
915
|
# redundant by design.
|
data/lib/scraper/reader.rb
CHANGED
@@ -10,7 +10,7 @@ require "net/http"
|
|
10
10
|
require "net/https"
|
11
11
|
begin
|
12
12
|
require "rubygems"
|
13
|
-
require "
|
13
|
+
require "tidy_ffi"
|
14
14
|
rescue LoadError
|
15
15
|
end
|
16
16
|
|
@@ -95,6 +95,7 @@ module Scraper
|
|
95
95
|
# * :redirect_limit -- Number of redirects allowed (default is 3).
|
96
96
|
# * :user_agent -- The User-Agent header to send.
|
97
97
|
# * :timeout -- HTTP open connection/read timeouts (in seconds).
|
98
|
+
# * :ssl_verify_mode -- SSL verification mode, defaults to OpenSSL::SSL::VERIFY_NONE
|
98
99
|
#
|
99
100
|
# It returns a hash with the following information:
|
100
101
|
# * :url -- The URL of the requested page (may change by permanent redirect)
|
@@ -123,6 +124,7 @@ module Scraper
|
|
123
124
|
begin
|
124
125
|
http = Net::HTTP.new(uri.host, uri.port)
|
125
126
|
http.use_ssl = (uri.scheme == "https")
|
127
|
+
http.verify_mode = options[:ssl_verify_mode] || OpenSSL::SSL::VERIFY_NONE
|
126
128
|
http.close_on_empty_response = true
|
127
129
|
http.open_timeout = http.read_timeout = options[:http_timeout] || DEFAULT_TIMEOUT
|
128
130
|
path = uri.path.dup # required so we don't modify path
|
@@ -153,12 +155,12 @@ module Scraper
|
|
153
155
|
return Page[(options[:source_url] || uri), nil, nil,
|
154
156
|
options[:last_modified], options[:etag]]
|
155
157
|
when Net::HTTPMovedPermanently
|
156
|
-
return read_page(response["location"], # New URL takes effect
|
158
|
+
return read_page((uri.merge(response["location"]) rescue nil), # New URL takes effect
|
157
159
|
:last_modified=>options[:last_modified],
|
158
160
|
:etag=>options[:etag],
|
159
161
|
:redirect_limit=>redirect_limit-1)
|
160
162
|
when Net::HTTPRedirection
|
161
|
-
return read_page(response["location"],
|
163
|
+
return read_page((uri.merge(response["location"]) rescue nil),
|
162
164
|
:last_modified=>options[:last_modified],
|
163
165
|
:etag=>options[:etag],
|
164
166
|
:redirect_limit=>redirect_limit-1,
|
@@ -202,10 +204,8 @@ module Scraper
|
|
202
204
|
find_tidy
|
203
205
|
options = (options || {}).update(TIDY_OPTIONS)
|
204
206
|
options[:input_encoding] = encoding.gsub("-", "").downcase
|
205
|
-
|
206
|
-
|
207
|
-
HTML::Document.new(html).find(:tag=>"html")
|
208
|
-
end
|
207
|
+
html = TidyFFI::Tidy.with_options(options).clean(content)
|
208
|
+
document = HTML::Document.new(html).find(:tag=>"html")
|
209
209
|
when :html_parser
|
210
210
|
document = HTML::HTMLParser.parse(content).root
|
211
211
|
else
|
@@ -219,17 +219,18 @@ module Scraper
|
|
219
219
|
|
220
220
|
|
221
221
|
protected
|
222
|
+
|
222
223
|
module_function
|
223
224
|
|
224
225
|
def find_tidy()
|
225
|
-
return if
|
226
|
+
return if TidyFFI.library_path
|
226
227
|
begin
|
227
|
-
|
228
|
+
TidyFFI.library_path = File.join(File.dirname(__FILE__), "../tidy", "libtidy.so")
|
228
229
|
rescue LoadError
|
229
230
|
begin
|
230
|
-
|
231
|
+
TidyFFI.library_path = File.join(File.dirname(__FILE__), "../tidy", "libtidy.dll")
|
231
232
|
rescue LoadError
|
232
|
-
|
233
|
+
TidyFFI.library_path = File.join(File.dirname(__FILE__), "../tidy", "libtidy.dylib")
|
233
234
|
end
|
234
235
|
end
|
235
236
|
end
|
data/lib/tidy/libtidy.dll
CHANGED
File without changes
|
data/lib/tidy/libtidy.so
CHANGED
File without changes
|
data/test/mock_net_http.rb
CHANGED
File without changes
|
data/test/node_ext_test.rb
CHANGED
data/test/reader_test.rb
CHANGED
@@ -12,8 +12,8 @@ require "webrick"
|
|
12
12
|
require "webrick/https"
|
13
13
|
require "logger"
|
14
14
|
require "stringio"
|
15
|
-
require
|
16
|
-
require
|
15
|
+
require "./test/mock_net_http"
|
16
|
+
require "./lib/scrapi"
|
17
17
|
|
18
18
|
|
19
19
|
class ReaderTest < Test::Unit::TestCase
|
@@ -144,6 +144,25 @@ class ReaderTest < Test::Unit::TestCase
|
|
144
144
|
end
|
145
145
|
|
146
146
|
|
147
|
+
def test_should_support_partial_location_redirection
|
148
|
+
# Test working redirection. Redirect only once and test response URL.
|
149
|
+
# Should be new URL for permanent redirect, same URL for all other redirects.
|
150
|
+
Net::HTTP.on_get do |address, path, headers|
|
151
|
+
if path == "/somewhere"
|
152
|
+
[Net::HTTPSuccess.new(Net::HTTP.version_1_2, 200, "OK"), ""]
|
153
|
+
else
|
154
|
+
response = Net::HTTPMovedPermanently.new(Net::HTTP.version_1_2, 301, "Moved")
|
155
|
+
response["location"] = "somewhere"
|
156
|
+
[response, ""]
|
157
|
+
end
|
158
|
+
end
|
159
|
+
assert_nothing_raised() do
|
160
|
+
response = Reader.read_page("http://localhost/path?query")
|
161
|
+
assert_equal "http://localhost/somewhere", response.url.to_s
|
162
|
+
end
|
163
|
+
end
|
164
|
+
|
165
|
+
|
147
166
|
def test_should_use_cache_control
|
148
167
|
# Test Last Modified and ETag headers. First, that they are correctly
|
149
168
|
# returned from headers to response object. Next, that passing right
|
@@ -220,22 +239,22 @@ class ReaderTest < Test::Unit::TestCase
|
|
220
239
|
# Test content encoding returned from HTTP server.
|
221
240
|
with_webrick do |server, params|
|
222
241
|
server.mount_proc "/test.html" do |req,resp|
|
223
|
-
resp["Content-Type"] = "text/html; charset=
|
242
|
+
resp["Content-Type"] = "text/html; charset=ASCII"
|
224
243
|
resp.body = "Content comes here"
|
225
244
|
end
|
226
245
|
page = Reader.read_page(WEBRICK_TEST_URL)
|
227
246
|
page = Reader.parse_page(page.content, page.encoding)
|
228
|
-
assert_equal "
|
247
|
+
assert_equal "ASCII", page.encoding
|
229
248
|
end
|
230
249
|
# Test content encoding in HTML http-equiv header
|
231
250
|
# that overrides content encoding returned in HTTP.
|
232
251
|
with_webrick do |server, params|
|
233
252
|
server.mount_proc "/test.html" do |req,resp|
|
234
|
-
resp["Content-Type"] = "text/html; charset=
|
253
|
+
resp["Content-Type"] = "text/html; charset=ASCII"
|
235
254
|
resp.body = %Q{
|
236
255
|
<html>
|
237
256
|
<head>
|
238
|
-
<meta http-equiv="content-type" value="text/html; charset=
|
257
|
+
<meta http-equiv="content-type" value="text/html; charset=UTF-8">
|
239
258
|
</head>
|
240
259
|
<body></body>
|
241
260
|
</html>
|
@@ -243,7 +262,7 @@ class ReaderTest < Test::Unit::TestCase
|
|
243
262
|
end
|
244
263
|
page = Reader.read_page(WEBRICK_TEST_URL)
|
245
264
|
page = Reader.parse_page(page.content, page.encoding)
|
246
|
-
assert_equal "
|
265
|
+
assert_equal "UTF-8", page.encoding
|
247
266
|
end
|
248
267
|
end
|
249
268
|
|
@@ -251,7 +270,7 @@ class ReaderTest < Test::Unit::TestCase
|
|
251
270
|
begin
|
252
271
|
options = WEBRICK_OPTIONS.dup.update(
|
253
272
|
:SSLEnable=>true,
|
254
|
-
:SSLVerifyClient =>
|
273
|
+
:SSLVerifyClient => OpenSSL::SSL::VERIFY_NONE,
|
255
274
|
:SSLCertName => [ ["C","JP"], ["O","WEBrick.Org"], ["CN", "WWW"] ]
|
256
275
|
)
|
257
276
|
server = WEBrick::HTTPServer.new(options)
|
data/test/scraper_test.rb
CHANGED
@@ -8,8 +8,8 @@
|
|
8
8
|
require "rubygems"
|
9
9
|
require "time"
|
10
10
|
require "test/unit"
|
11
|
-
require
|
12
|
-
require
|
11
|
+
require "./test/mock_net_http"
|
12
|
+
require "./lib/scrapi"
|
13
13
|
|
14
14
|
|
15
15
|
class ScraperTest < Test::Unit::TestCase
|
@@ -287,6 +287,7 @@ class ScraperTest < Test::Unit::TestCase
|
|
287
287
|
scraper = new_scraper(html) do
|
288
288
|
process "#1", :this1=>:text
|
289
289
|
process "#1", :this2=>:text
|
290
|
+
attr_reader :this1, :this2
|
290
291
|
end
|
291
292
|
scraper.scrape
|
292
293
|
assert_equal "this", scraper.this1
|
@@ -295,16 +296,18 @@ class ScraperTest < Test::Unit::TestCase
|
|
295
296
|
scraper = new_scraper(html) do
|
296
297
|
process "#1", :this1=>:text, :skip=>false
|
297
298
|
process "#1", :this2=>:text
|
299
|
+
attr_reader :this1, :this2
|
298
300
|
end
|
299
301
|
scraper.scrape
|
300
302
|
assert_equal "this", scraper.this1
|
301
303
|
assert_equal "this", scraper.this2
|
302
304
|
|
303
305
|
scraper = new_scraper(html) do
|
304
|
-
process "#1", :this1=>:text, :skip=>true do
|
305
|
-
|
306
|
+
process "#1", :this1=>:text, :skip=>true do |element|
|
307
|
+
element
|
306
308
|
end
|
307
309
|
process "#1", :this2=>:text
|
310
|
+
attr_reader :this1, :this2
|
308
311
|
end
|
309
312
|
scraper.scrape
|
310
313
|
assert_equal "this", scraper.this1
|
@@ -351,7 +354,7 @@ class ScraperTest < Test::Unit::TestCase
|
|
351
354
|
[response, <<-EOF
|
352
355
|
<html>
|
353
356
|
<head>
|
354
|
-
<meta http-equiv="content-type" value="text/html; charset=
|
357
|
+
<meta http-equiv="content-type" value="text/html; charset=ASCII">
|
355
358
|
</head>
|
356
359
|
<body>
|
357
360
|
<div id="x"/>
|
@@ -371,7 +374,7 @@ class ScraperTest < Test::Unit::TestCase
|
|
371
374
|
assert_equal "http://localhost/redirect", scraper.page_info.url.to_s
|
372
375
|
assert_equal time, scraper.page_info.last_modified
|
373
376
|
assert_equal "etag", scraper.page_info.etag
|
374
|
-
assert_equal "
|
377
|
+
assert_equal "ASCII", scraper.page_info.encoding
|
375
378
|
end
|
376
379
|
|
377
380
|
|
@@ -563,6 +566,7 @@ class ScraperTest < Test::Unit::TestCase
|
|
563
566
|
process "h1", [:text, :kls]=>Scraper.define {
|
564
567
|
process "*", :text=>:text, :kls=>"@class"
|
565
568
|
}
|
569
|
+
attr_reader :text, :kls
|
566
570
|
end
|
567
571
|
result = scraper.scrape
|
568
572
|
assert "first", result.text
|
@@ -618,6 +622,7 @@ class ScraperTest < Test::Unit::TestCase
|
|
618
622
|
|
619
623
|
scraper = new_scraper(DIVS_ST_ND) do
|
620
624
|
process_first "div", :div_id=>"@id", :div_text=>:text
|
625
|
+
attr_reader :div_id, :div_text
|
621
626
|
end
|
622
627
|
value = scraper.scrape
|
623
628
|
assert_equal "1", value.div_id
|
@@ -721,7 +726,7 @@ class ScraperTest < Test::Unit::TestCase
|
|
721
726
|
# Extracting the attribute skips the second match.
|
722
727
|
scraper = new_scraper(DIVS123) do
|
723
728
|
process("div") { |element| @count +=1 }
|
724
|
-
define_method(:prepare) { @count = 1 }
|
729
|
+
define_method(:prepare) { |element| @count = 1 }
|
725
730
|
define_method(:result) { @count }
|
726
731
|
end
|
727
732
|
result = scraper.scrape
|
data/test/selector_test.rb
CHANGED
metadata
CHANGED
@@ -1,82 +1,107 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
|
-
rubygems_version: 0.9.0
|
3
|
-
specification_version: 1
|
4
2
|
name: scrapi
|
5
3
|
version: !ruby/object:Gem::Version
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
-
|
11
|
-
|
12
|
-
homepage: http://blog.labnotes.org/category/scrapi/
|
13
|
-
rubyforge_project: scrapi
|
14
|
-
description: scrAPI is an HTML scraping toolkit for Ruby. It uses CSS selectors to write easy, maintainable scraping rules to select, extract and store data from HTML content.
|
15
|
-
autorequire: scrapi.rb
|
16
|
-
default_executable:
|
17
|
-
bindir: bin
|
18
|
-
has_rdoc: true
|
19
|
-
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
20
|
-
requirements:
|
21
|
-
- - ">"
|
22
|
-
- !ruby/object:Gem::Version
|
23
|
-
version: 0.0.0
|
24
|
-
version:
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 2
|
7
|
+
- 0
|
8
|
+
- 0
|
9
|
+
version: 2.0.0
|
25
10
|
platform: ruby
|
26
|
-
signing_key:
|
27
|
-
cert_chain:
|
28
|
-
post_install_message:
|
29
11
|
authors:
|
30
12
|
- Assaf Arkin
|
13
|
+
autorequire: scrapi.rb
|
14
|
+
bindir: bin
|
15
|
+
cert_chain: []
|
16
|
+
|
17
|
+
date: 2010-11-10 00:00:00 -08:00
|
18
|
+
default_executable:
|
19
|
+
dependencies:
|
20
|
+
- !ruby/object:Gem::Dependency
|
21
|
+
name: tidy_ffi
|
22
|
+
prerelease: false
|
23
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
24
|
+
none: false
|
25
|
+
requirements:
|
26
|
+
- - ">="
|
27
|
+
- !ruby/object:Gem::Version
|
28
|
+
segments:
|
29
|
+
- 0
|
30
|
+
- 1
|
31
|
+
- 2
|
32
|
+
version: 0.1.2
|
33
|
+
type: :runtime
|
34
|
+
version_requirements: *id001
|
35
|
+
description: |
|
36
|
+
scrAPI is an HTML scraping toolkit for Ruby. It uses CSS selectors to write easy, maintainable scraping rules to select, extract and store data from HTML content.
|
37
|
+
|
38
|
+
email: assaf@labnotes.org
|
39
|
+
executables: []
|
40
|
+
|
41
|
+
extensions: []
|
42
|
+
|
43
|
+
extra_rdoc_files:
|
44
|
+
- README.rdoc
|
31
45
|
files:
|
46
|
+
- test/mock_net_http.rb
|
32
47
|
- test/node_ext_test.rb
|
48
|
+
- test/reader_test.rb
|
33
49
|
- test/scraper_test.rb
|
34
|
-
- test/mock_net_http.rb
|
35
50
|
- test/selector_test.rb
|
36
|
-
-
|
37
|
-
- lib/
|
38
|
-
- lib/
|
39
|
-
- lib/tidy
|
40
|
-
- lib/html
|
41
|
-
- lib/scraper/reader.rb
|
42
|
-
- lib/scraper/base.rb
|
43
|
-
- lib/scraper/microformats.rb
|
44
|
-
- lib/tidy/libtidy.so
|
45
|
-
- lib/tidy/libtidy.dll
|
51
|
+
- lib/html/document.rb
|
52
|
+
- lib/html/htmlparser.rb
|
53
|
+
- lib/html/node.rb
|
46
54
|
- lib/html/node_ext.rb
|
47
55
|
- lib/html/selector.rb
|
48
|
-
- lib/html/node.rb
|
49
|
-
- lib/html/version.rb
|
50
56
|
- lib/html/tokenizer.rb
|
51
|
-
- lib/html/
|
52
|
-
- lib/
|
53
|
-
-
|
57
|
+
- lib/html/version.rb
|
58
|
+
- lib/scraper/base.rb
|
59
|
+
- lib/scraper/microformats.rb
|
60
|
+
- lib/scraper/reader.rb
|
61
|
+
- lib/scrapi.rb
|
62
|
+
- lib/tidy/libtidy.dll
|
63
|
+
- lib/tidy/libtidy.so
|
64
|
+
- README.rdoc
|
54
65
|
- CHANGELOG
|
55
66
|
- Rakefile
|
56
67
|
- MIT-LICENSE
|
57
|
-
|
68
|
+
has_rdoc: true
|
69
|
+
homepage: http://github.com/assaf/scrapi
|
70
|
+
licenses: []
|
58
71
|
|
72
|
+
post_install_message:
|
59
73
|
rdoc_options:
|
60
74
|
- --main
|
61
|
-
- README
|
75
|
+
- README.rdoc
|
62
76
|
- --title
|
63
77
|
- scrAPI toolkit for Ruby
|
64
78
|
- --line-numbers
|
65
|
-
|
66
|
-
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
79
|
+
require_paths:
|
80
|
+
- lib
|
81
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
82
|
+
none: false
|
83
|
+
requirements:
|
84
|
+
- - ">="
|
85
|
+
- !ruby/object:Gem::Version
|
86
|
+
segments:
|
87
|
+
- 1
|
88
|
+
- 9
|
89
|
+
- 1
|
90
|
+
version: 1.9.1
|
91
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
92
|
+
none: false
|
93
|
+
requirements:
|
94
|
+
- - ">="
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
segments:
|
97
|
+
- 0
|
98
|
+
version: "0"
|
71
99
|
requirements:
|
72
|
-
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
- !ruby/object:Gem::Version
|
81
|
-
version: 1.1.0
|
82
|
-
version:
|
100
|
+
- Tidy_ffi
|
101
|
+
rubyforge_project: scrapi
|
102
|
+
rubygems_version: 1.3.7
|
103
|
+
signing_key:
|
104
|
+
specification_version: 3
|
105
|
+
summary: scrAPI toolkit for Ruby. Uses CSS selectors to write easy, maintainable HTML scraping rules.
|
106
|
+
test_files: []
|
107
|
+
|