Scrapah 1.0.0

data/.gitignore ADDED
@@ -0,0 +1,32 @@
+ *.gem
+ *.rbc
+ /.config
+ /coverage/
+ /InstalledFiles
+ /pkg/
+ /spec/reports/
+ /test/tmp/
+ /test/version_tmp/
+ /tmp/
+
+ ## Documentation cache and generated files:
+ /.yardoc/
+ /_yardoc/
+ /doc/
+ /rdoc/
+
+ ## Environment normalisation:
+ /.bundle/
+ /lib/bundler/man/
+
+ # for a library or gem, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # Gemfile.lock
+ # .ruby-version
+ # .ruby-gemset
+
+ # unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
+ .rvmrc
+
+ # Ignore Saved cache
+ */cache/*
data/Gemfile ADDED
@@ -0,0 +1,5 @@
+
+
+ source "https://rubygems.org"
+
+ gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,38 @@
+ PATH
+   remote: .
+   specs:
+     Scrapah (1.0.0)
+       headless (>= 1.0.1)
+       json (>= 1.8.1)
+       nokogiri (>= 1.6.0)
+       retryable (>= 1.3.3)
+       watir-webdriver (>= 0.6.4)
+
+ GEM
+   remote: https://rubygems.org/
+   specs:
+     childprocess (0.3.9)
+       ffi (~> 1.0, >= 1.0.11)
+     ffi (1.9.3)
+     headless (1.0.1)
+     json (1.8.1)
+     mini_portile (0.5.2)
+     multi_json (1.8.2)
+     nokogiri (1.6.0)
+       mini_portile (~> 0.5.0)
+     retryable (1.3.3)
+     rubyzip (1.0.0)
+     selenium-webdriver (2.37.0)
+       childprocess (>= 0.2.5)
+       multi_json (~> 1.0)
+       rubyzip (~> 1.0.0)
+       websocket (~> 1.0.4)
+     watir-webdriver (0.6.4)
+       selenium-webdriver (>= 2.18.0)
+     websocket (1.0.7)
+
+ PLATFORMS
+   ruby
+
+ DEPENDENCIES
+   Scrapah!
data/LICENSE ADDED
@@ -0,0 +1,20 @@
+ The MIT License (MIT)
+
+ Copyright (c) 2013 Simon Gruening
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy of
+ this software and associated documentation files (the "Software"), to deal in
+ the Software without restriction, including without limitation the rights to
+ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+ the Software, and to permit persons to whom the Software is furnished to do so,
+ subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+ FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+ COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+ IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/Rakefile ADDED
@@ -0,0 +1,4 @@
+
+ require 'bundler'
+
+ Bundler::GemHelper.install_tasks
data/lib/scrapah.rb ADDED
@@ -0,0 +1,16 @@
+
+
+ # Scrapah
+
+ # / ZirconCode
+
+
+ module Scrapah
+
+   require 'scrapah/cache.rb'
+   require 'scrapah/scraper.rb'
+   require 'scrapah/extract.rb'
+
+ end
+
+
data/lib/scrapah/cache.rb ADDED
@@ -0,0 +1,72 @@
+
+
+ require 'json'
+
+ module Scrapah
+
+   class Cache
+
+     # TODO: 'throws away' the whole cache after timeout,
+     # -> treat entries as separate objects/files/dates
+
+     @@cache_dir = 'cache/'
+
+     def initialize()
+       Dir.mkdir(@@cache_dir) unless File.exist?(@@cache_dir)
+       @Cache = Hash.new
+       @keep_time = 1*24*60 # in minutes
+     end
+
+     def store(key,content)
+       @Cache[key] = content
+     end
+
+     def get(key)
+       @Cache[key]
+     end
+
+     def has_key?(key)
+       @Cache.has_key? key
+     end
+
+     def clear()
+       @Cache = Hash.new
+     end
+
+     def save
+       # WARNING: Symbols converted to Strings
+       f = File.new(@@cache_dir+Time.now.to_i.to_s,'w')
+       JSON.dump(@Cache,f)
+       f.close
+     end
+
+
+     def load
+       f = get_newest_acceptable
+       @Cache = Hash.new
+       @Cache = JSON.load(f) unless f.nil?
+       f.close unless f.nil?
+
+       @Cache
+     end
+
+     def get_hash
+       @Cache
+     end
+
+     private
+
+     # newest cache file still within @keep_time, or nil
+     def get_newest_acceptable()
+       prev = Dir.glob(@@cache_dir+'*')
+       if(!prev.empty?)
+         prev.map!{|f| f.sub(@@cache_dir,'').to_i}
+         prev.sort!
+         return File.new(@@cache_dir+prev.last.to_s,"r") if(Time.now.to_i-prev.last < @keep_time*60)
+       end
+
+       nil
+     end
+
+   end
+
+ end
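
A minimal usage sketch of the cache on its own, loosely mirroring test/test_cache.rb (the key and value below are made up; the cache writes timestamped JSON files into a cache/ directory under the current working directory):

    require 'scrapah'

    c = Scrapah::Cache.new                        # creates cache/ if it does not exist
    c.store('https://example.org', '<html>...</html>')
    c.save                                        # dumps the hash to cache/<unix timestamp>
    c.clear
    c.load                                        # reloads the newest dump still within @keep_time
    c.get('https://example.org')                  # => "<html>...</html>"
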
data/lib/scrapah/extract.rb ADDED
@@ -0,0 +1,45 @@
+
+ module Scrapah
+
+   class Extract
+
+     # These methods are not perfect, but good enough:
+     # they are designed to extract from plain text.
+     # Non-obfuscated plain text...
+     # Please don't spam me =(
+
+     # TODO: make case-insensitive and remove one character set?
+     def self.emails(content)
+       r = Regexp.new(/\b([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,4})\b/)
+       regex(content,r)
+     end
+
+     def self.ips(content)
+       # very simple IPv4 regex
+       r = Regexp.new(/\b((?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?))\b/)
+       regex(content,r)
+     end
+
+     def self.proxies(content)
+       # e.g. IPaddress:port
+       r = Regexp.new(/\b((?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\:[0-9]{1,5})\b/)
+       regex(content,r)
+     end
+
+     def self.regex(content, regex)
+       # deals with nokogiri docs and misc input
+       if content.respond_to?(:to_s) && !content.is_a?(String)
+         content = content.to_s
+       end
+
+       results = []
+       results << content.scan(regex)
+       results = results.flatten.uniq
+
+       results
+     end
+
+
+   end
+
+ end
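
The extractors can also be used directly on plain strings, independently of the scraper; this sketch mirrors test/test_extract.rb (the sample text is made up):

    require 'scrapah'

    text = 'mail me at test@test.com or use the proxy at 123.143.114.123:4444'

    Scrapah::Extract.emails(text)    # => ["test@test.com"]
    Scrapah::Extract.ips(text)       # => ["123.143.114.123"]
    Scrapah::Extract.proxies(text)   # => ["123.143.114.123:4444"]
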
data/lib/scrapah/scraper.rb ADDED
@@ -0,0 +1,175 @@
+
+
+ require 'nokogiri'
+
+ require 'retryable'
+
+ # TODO optional requires?
+ require 'open-uri'
+ require 'watir-webdriver'
+ require 'headless' # needs xvfb installed
+
+
+ module Scrapah
+
+   class Scraper
+
+     include Retryable
+
+     # TODO needs full url for caching to work properly atm
+
+     # TODO Patterns class, for recursive-autodiscovery, proxy-switching etc... ?
+
+     def initialize(scrape_type=:openuri, caching=false)
+       @access_type = scrape_type
+       @current_url = ''
+
+       @caching = caching
+       if @caching
+         @cache = Scrapah::Cache.new
+         @cache.load
+       end
+
+       # .start automatically?
+     end
+
+
+     def start()
+       # start headless
+       if(@access_type == :headless)
+         @headless = Headless.new
+         @headless.start
+         @browser = Watir::Browser.new # default browser
+       end
+     end
+
+     def stop()
+       # end headless/close stuff
+       if(@access_type == :headless)
+         @browser.close
+         @headless.destroy
+       end
+     end
+
+
+     def visit(url)
+       # cache the url
+
+       @current_url = url
+
+       return nil if !@caching
+
+       doc = get_appropriate(url)
+
+       @cache.store(url,doc.to_s)
+       @cache.save # TODO ???
+     end
+
+     def get(url)
+       # visit(url) if caching and not cached
+       # return result
+       @current_url = url
+
+       if(@caching)
+         visit(url) if !@cache.has_key? url
+         Nokogiri::HTML(@cache.get(url))
+       else
+         get_appropriate(url)
+       end
+     end
+
+     # TODO split process! and process ....
+     def process(input)
+       # get current_url source
+       doc = get(@current_url)
+
+       if input.is_a?(Hash)
+         result = Hash.new
+         input.each{|k,v| result[k] = process_appropriate(doc,v)}
+         return result
+       else
+         return process_appropriate(doc,input)
+       end
+
+       nil
+     end
+
+
+     private
+
+     # TODO retry & retry strategies
+     # returns nokogiri docs
+     def get_appropriate(url)
+       retryable :tries => 4, :sleep => 1.5 do
+         return get_headless(url) if(@access_type == :headless)
+         return get_openuri(url) if(@access_type == :openuri)
+       end
+     end
+
+     def get_headless(url)
+       return nil if !started_headless?
+
+       @browser.goto url
+       Nokogiri::HTML(@browser.html)
+     end
+
+     def get_openuri(url)
+       Nokogiri::HTML(open(url))
+     end
+
+
+     def started_headless?()
+       if @browser.nil? || @headless.nil?
+         raise 'Call Scraper.start first when using :headless'
+         return false
+       end
+       return true
+     end
+
+
+     # accepts nokogiri docs only atm
+     def process_appropriate(doc,cmd)
+
+       return process_regex(doc,cmd) if(cmd.is_a? Regexp)
+       return process_proc(doc,cmd) if(cmd.is_a? Proc)
+
+       if cmd.is_a?(String)
+         return process_xpath(doc,cmd) if cmd.start_with?("x|")
+         return process_css(doc,cmd) if cmd.start_with?("c|")
+       end
+
+       nil
+
+     end
+
+     def process_regex(doc,regex)
+       doc.to_s.scan(regex).flatten
+     end
+
+     def process_xpath(doc,xpath)
+       xpath.slice!('x|')
+       sanitize_nokogiri doc.xpath(xpath)
+     end
+
+     def process_css(doc,css)
+       css.slice!('c|')
+       sanitize_nokogiri doc.css(css)
+     end
+
+     def process_proc(doc,proc)
+       proc.call(doc)
+     end
+
+
+     def sanitize_nokogiri(stuff)
+       return stuff.to_s if(stuff.count == 1)
+
+       result = []
+       stuff.each{|a| result << a.to_s}
+       result
+     end
+
+
+   end
+
+ end
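
A rough sketch of the caching flow above (the second constructor argument enables caching; it is off by default, and pages are persisted through Scrapah::Cache into cache/ under the working directory):

    require 'scrapah'

    s = Scrapah::Scraper.new :openuri, true     # true switches caching on
    s.visit 'https://github.com/ZirconCode'     # fetches the page and stores it in the cache
    p s.process('c|h1')                         # parses the cached copy, no second fetch
    p s.process(/\b(\d*?) Total\b/)             # same cached copy, different extractor
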
data/lib/scrapah/version.rb ADDED
@@ -0,0 +1,4 @@
+
+ module Scrapah
+   VERSION = "1.0.0"
+ end
data/readme.markdown ADDED
@@ -0,0 +1,95 @@
+
+
+ Scrapah
+ ===
+
+ Scrapah is a Ruby gem for web scraping and flexible content extraction. Scrapah takes your Hashes, regexes, XPath and CSS selectors, and even Procs, and turns them into neat results. Scrapah is heavily based on Nokogiri.
+
+ Installation
+ ---
+
+     gem install Scrapah
+
+     # running in :headless requires xvfb to be installed on your system
+     # for details, see the headless gem at https://github.com/leonid-shevtsov/headless
+     # > sudo apt-get install xvfb
+
+
+ Using Scrapah
+ ---
+
+ **Start** a Scraper
+
+     require 'scrapah'
+
+     # use :headless to drive a browser in the background
+     s = Scrapah::Scraper.new :openuri
+     s.start
+
+ Tell Scrapah where to **go**
+
+     s.visit 'https://github.com/ZirconCode'
+
+ You can pass in a **regex**
+
+     # ex. get my total contributions on github
+     p s.process(/\b(\d*?) Total\b/)
+
+ Scrapah will treat a string beginning with 'c|' as a **CSS** selector
+
+     # ex. get the main heading of the page
+     p s.process 'c|h1'
+
+ Scrapah will treat a string beginning with 'x|' as an **XPath**
+
+     # ex. get my popular repositories
+     p s.process 'x|//span[@class="repo"]/text()'
+
+ Scrapah will even take your **Procs**
+
+     # ex. extract all emails from a website
+     p s.process Proc.new{|nokogiri_doc| Scrapah::Extract.emails nokogiri_doc}
+
+ The magic: pass in a **Hash** of the above and get a Hash of results back
+
+     s.visit 'https://github.com/ZirconCode'
+
+     profile = Hash.new
+     profile[:name] = Proc.new{|d| 'The Great '+d.xpath('//span[@class="vcard-username"]/text()').to_s}
+     profile[:total_contrib] = /\b(\d*?) Total\b/
+     profile[:join_date] = 'x|//span[@class="join-date"]/text()'
+     profile[:popular_repos] = 'x|//span[@class="repo"]/text()'
+     profile[:website] = 'x|//a[@class="url"]/text()'
+
+     p s.process(profile) # ^_^
+
+ Don't forget to stop Scrapah at the end =)
+
+     s.stop
+
+
+ Development
+ ---
+
+     # setup
+     git clone https://github.com/ZirconCode/Scrapah.git
+     cd Scrapah
+     bundle install
+
+     # testing
+     cd test/
+     ruby test_all.rb
+
+     # to install the gem locally
+     rake install
+
+     # Pull requests are welcome =)
+
+
+ Todo
+ ---
+
+ * Improve the caching
+ * Optional requires (e.g. watir-webdriver only when :headless is used)
+ * Proxy support
+ * Recursive autodiscovery, proxy switching, and other 'patterns'
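
The test suite exercises the same API against a local fixture instead of the network, which is also a convenient way to experiment offline; a sketch along the lines of test/test_scraper.rb, run from the test/ directory:

    require_relative '../lib/scrapah/scraper'

    s = Scrapah::Scraper.new :openuri
    s.visit Dir.pwd + '/fixtures/profile.html'   # open-uri reads local paths as well

    p s.process 'c|span'                 # the two span tags from the fixture
    p s.process 'x|/html/body/div/div'   # the div containing 'Johnny Blah'
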
data/scrapah.gemspec ADDED
@@ -0,0 +1,35 @@
+ # -*- encoding: utf-8 -*-
+
+ $:.push File.expand_path("../lib", __FILE__)
+ require "scrapah/version"
+
+ Gem::Specification.new do |s|
+
+   s.name        = "Scrapah"
+   s.version     = Scrapah::VERSION
+   s.platform    = Gem::Platform::RUBY
+   s.authors     = ["Simon Gruening"]
+   s.email       = ["simon@zirconcode.com"]
+   s.license     = 'MIT'
+   s.homepage    = "https://github.com/ZirconCode/Scrapah"
+   s.summary     = %q{Flexible Web Scraping and Content Extraction}
+   s.description = %q{A flexible web-scraper with built in content extraction}
+
+
+   s.required_ruby_version = '>= 1.9.3'
+
+   s.add_runtime_dependency 'nokogiri', '>= 1.6.0'
+   s.add_runtime_dependency 'retryable', '>= 1.3.3'
+   s.add_runtime_dependency 'json', '>= 1.8.1'
+
+   s.add_runtime_dependency 'watir-webdriver', '>= 0.6.4'
+   s.add_runtime_dependency 'headless', '>= 1.0.1' # needs xvfb installed
+
+   #s.add_development_dependency "rspec", "~>2.5.0"
+
+   s.files         = `git ls-files`.split("\n")
+   s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
+   s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
+   s.require_paths = ["lib"]
+
+ end
data/test/fixtures/profile.html ADDED
@@ -0,0 +1,26 @@
+
+ <html>
+
+   <head>
+
+   </head>
+
+   <body>
+
+
+     Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi architecto beatae vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut fugit, sed quia consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt. Neque porro quisquam est, qui dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore et dolore magnam aliquam quaerat voluptatem. Ut enim ad minima veniam, quis nostrum exercitationem ullam corporis suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur? Quis autem vel eum iure reprehenderit qui in ea voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla test@test.com pariatur?
+
+     <div class="profile">
+       <div class="name">Johnny Blah</div>
+       <span id="descr">Greatest of Johnnies</span>
+       <span id="descr">also is Great</span>
+     </div>
+
+     <p>
+       At 123.123.123.123 vero eos et accusamus et iusto odio dignissimos ducimus qui blanditiis praesentium voluptatum deleniti atque corrupti quos dolores et quas molestias excepturi sint occaecati cupiditate non provident, similique sunt in culpa qui officia deserunt mollitia animi, id est laborum et dolorum fuga. Et harum quidem rerum facilis est et expedita distinctio. Nam libero tempore, cum soluta nobis est eligendi optio cumque nihil impedit quo minus id quod maxime placeat facere possimus, omnis voluptas assumenda est, omnis dolor repellendus. Temporibus autem quibusdam et aut officiis debitis aut rerum necessitatibus saepe eveniet ut et voluptates repudiandae sint et molestiae non recusandae. Itaque earum rerum hic tenetur a sapiente delectus, ut aut reiciendis voluptatibus maiores alias consequatur aut perferendis doloribus asperiores repellat
+     </p>
+
+   </body>
+
+
+ </html>
data/test/test_all.rb ADDED
@@ -0,0 +1,11 @@
+
+
+ require 'test/unit'
+
+
+ require_relative 'test_cache.rb'
+ require_relative 'test_extract.rb'
+ require_relative 'test_scraper.rb'
+
+
+
data/test/test_cache.rb ADDED
@@ -0,0 +1,51 @@
+
+
+ # Test cache.rb
+
+ require 'test/unit'
+ require '../lib/scrapah/cache'
+
+
+ class TestCache < Test::Unit::TestCase
+
+   def setup
+     @c = Scrapah::Cache.new
+   end
+
+
+   def test_new()
+     assert(@c.instance_of? Scrapah::Cache)
+   end
+
+   def test_store_and_get
+     @c.store("test",3)
+     assert(@c.get("test") == 3)
+
+     assert(@c.get("doesn't exist").nil?)
+   end
+
+   def test_has_key()
+     @c.store("test",3)
+     assert(@c.has_key? "test")
+     assert(!@c.has_key?("nope"))
+   end
+
+   def test_clear()
+     @c.store("test",3)
+
+     assert(@c.get("test")==3)
+     @c.clear
+     assert(@c.get("test").nil?)
+   end
+
+   def test_save_and_load()
+     @c.store("old",1)
+     @c.save
+     @c.clear
+     @c.load
+     assert(@c.get("old")==1)
+   end
+
+ end
+
+
data/test/test_extract.rb ADDED
@@ -0,0 +1,34 @@
+
+
+ # Test extract.rb
+
+ require 'test/unit'
+ require '../lib/scrapah/extract'
+
+
+ class TestExtract < Test::Unit::TestCase
+
+   def test_emails
+     s = "test tes.com test@test.com tete@ .org"
+     assert(Scrapah::Extract.emails(s).count == 1)
+   end
+
+   def test_ips
+     s = "654.123 123.143.114.123 123.143.654.123 123.143.654.12322"
+     assert(Scrapah::Extract.ips(s).count == 1)
+   end
+
+   def test_proxies
+     s = "123.143. 654.123 112.143.114.123:4444 123.143.654.12322"
+     assert(Scrapah::Extract.proxies(s).count == 1)
+   end
+
+   def test_regex
+     s = 'hello hall o ello olle o.O/'
+     assert(Scrapah::Extract.regex(s,/(.ll)/).count == 3)
+   end
+
+
+ end
+
+
data/test/test_scraper.rb ADDED
@@ -0,0 +1,100 @@
+
+
+ # Test scraper.rb
+
+
+ require 'test/unit'
+ require '../lib/scrapah/scraper'
+ require '../lib/scrapah/extract' # for test_process
+
+
+ class TestScraper < Test::Unit::TestCase
+
+   def setup
+     # Uses fixtures/profile.html as "web access"
+     @fixture_profile = Dir.pwd+"/fixtures/profile.html"
+   end
+
+
+   def test_new_openuri
+     s = nil
+
+     assert_nothing_raised do
+       s = Scrapah::Scraper.new :openuri
+     end
+     assert(s.is_a? Scrapah::Scraper)
+   end
+
+   def test_get_openuri
+     s = Scrapah::Scraper.new :openuri
+     f = @fixture_profile
+
+     assert(s.get(f).to_s.include? 'Sed ut perspiciatis unde omnis')
+   end
+
+   def test_new_headless
+     s = nil
+
+     assert_nothing_raised do
+       s = Scrapah::Scraper.new :headless
+     end
+     assert(s.is_a? Scrapah::Scraper)
+   end
+
+   # heavy test
+   def test_start_stop_headless
+     s = Scrapah::Scraper.new :headless
+
+     assert_nothing_raised do
+       s.start
+       s.stop
+     end
+   end
+
+   # heavy test
+   def test_get_headless
+     s = Scrapah::Scraper.new :headless
+     f = 'file://'+@fixture_profile
+
+     s.start
+
+     assert(s.get(f).to_s.include? 'Sed ut perspiciatis unde omnis')
+
+     s.stop
+   end
+
+   # heavy test
+   def test_get_no_start_headless
+     s = Scrapah::Scraper.new :headless
+
+     assert_raise RuntimeError do
+       s.get('blah')
+     end
+   end
+
+   # Full Use Test
+   def test_process
+     s = Scrapah::Scraper.new :openuri
+     f = @fixture_profile
+
+     s.visit(f)
+
+     assert(s.process(/Sed ut perspiciatis/).count == 1)
+
+     h = Hash.new
+     h[:regex] = /Sed ut perspiciatis/
+     h[:proc] = Proc.new{|d| Scrapah::Extract.emails(d)}
+     h[:css] = "c|span"
+     h[:xpath] = "x|/html/body/div/div"
+
+     result = s.process(h)
+
+     assert(result[:regex].count == 1)
+     assert(result[:proc].count == 1)
+     assert(result[:css].count == 2)
+     assert(result[:xpath].include? 'Johnny Blah')
+   end
+
+
+ end
+
metadata ADDED
@@ -0,0 +1,118 @@
+ --- !ruby/object:Gem::Specification
+ name: Scrapah
+ version: !ruby/object:Gem::Version
+   version: 1.0.0
+ prerelease:
+ platform: ruby
+ authors:
+ - Simon Gruening
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2013-11-19 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: nokogiri
+   requirement: &79704310 !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: 1.6.0
+   type: :runtime
+   prerelease: false
+   version_requirements: *79704310
+ - !ruby/object:Gem::Dependency
+   name: retryable
+   requirement: &79704080 !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: 1.3.3
+   type: :runtime
+   prerelease: false
+   version_requirements: *79704080
+ - !ruby/object:Gem::Dependency
+   name: json
+   requirement: &79703850 !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: 1.8.1
+   type: :runtime
+   prerelease: false
+   version_requirements: *79703850
+ - !ruby/object:Gem::Dependency
+   name: watir-webdriver
+   requirement: &79703620 !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: 0.6.4
+   type: :runtime
+   prerelease: false
+   version_requirements: *79703620
+ - !ruby/object:Gem::Dependency
+   name: headless
+   requirement: &79703390 !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: 1.0.1
+   type: :runtime
+   prerelease: false
+   version_requirements: *79703390
+ description: A flexible web-scraper with built in content extraction
+ email:
+ - simon@zirconcode.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - .gitignore
+ - Gemfile
+ - Gemfile.lock
+ - LICENSE
+ - Rakefile
+ - lib/scrapah.rb
+ - lib/scrapah/cache.rb
+ - lib/scrapah/extract.rb
+ - lib/scrapah/scraper.rb
+ - lib/scrapah/version.rb
+ - readme.markdown
+ - scrapah.gemspec
+ - test/fixtures/profile.html
+ - test/test_all.rb
+ - test/test_cache.rb
+ - test/test_extract.rb
+ - test/test_scraper.rb
+ homepage: https://github.com/ZirconCode/Scrapah
+ licenses:
+ - MIT
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: 1.9.3
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 1.8.11
+ signing_key:
+ specification_version: 3
+ summary: Flexible Web Scraping and Content Extraction
+ test_files: []