Scrapah 1.0.0
- data/.gitignore +32 -0
- data/Gemfile +5 -0
- data/Gemfile.lock +38 -0
- data/LICENSE +20 -0
- data/Rakefile +4 -0
- data/lib/scrapah.rb +16 -0
- data/lib/scrapah/cache.rb +72 -0
- data/lib/scrapah/extract.rb +45 -0
- data/lib/scrapah/scraper.rb +175 -0
- data/lib/scrapah/version.rb +4 -0
- data/readme.markdown +95 -0
- data/scrapah.gemspec +35 -0
- data/test/fixtures/profile.html +26 -0
- data/test/test_all.rb +11 -0
- data/test/test_cache.rb +51 -0
- data/test/test_extract.rb +34 -0
- data/test/test_scraper.rb +100 -0
- metadata +118 -0
data/.gitignore
ADDED
@@ -0,0 +1,32 @@
+*.gem
+*.rbc
+/.config
+/coverage/
+/InstalledFiles
+/pkg/
+/spec/reports/
+/test/tmp/
+/test/version_tmp/
+/tmp/
+
+## Documentation cache and generated files:
+/.yardoc/
+/_yardoc/
+/doc/
+/rdoc/
+
+## Environment normalisation:
+/.bundle/
+/lib/bundler/man/
+
+# for a library or gem, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# Gemfile.lock
+# .ruby-version
+# .ruby-gemset
+
+# unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
+.rvmrc
+
+# Ignore saved cache
+*/cache/*
data/Gemfile.lock
ADDED
@@ -0,0 +1,38 @@
+PATH
+  remote: .
+  specs:
+    Scrapah (1.0.0)
+      headless (>= 1.0.1)
+      json (>= 1.8.1)
+      nokogiri (>= 1.6.0)
+      retryable (>= 1.3.3)
+      watir-webdriver (>= 0.6.4)
+
+GEM
+  remote: https://rubygems.org/
+  specs:
+    childprocess (0.3.9)
+      ffi (~> 1.0, >= 1.0.11)
+    ffi (1.9.3)
+    headless (1.0.1)
+    json (1.8.1)
+    mini_portile (0.5.2)
+    multi_json (1.8.2)
+    nokogiri (1.6.0)
+      mini_portile (~> 0.5.0)
+    retryable (1.3.3)
+    rubyzip (1.0.0)
+    selenium-webdriver (2.37.0)
+      childprocess (>= 0.2.5)
+      multi_json (~> 1.0)
+      rubyzip (~> 1.0.0)
+      websocket (~> 1.0.4)
+    watir-webdriver (0.6.4)
+      selenium-webdriver (>= 2.18.0)
+    websocket (1.0.7)
+
+PLATFORMS
+  ruby
+
+DEPENDENCIES
+  Scrapah!
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
+The MIT License (MIT)
+
+Copyright (c) 2013 Simon Gruening
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/Rakefile
ADDED
data/lib/scrapah.rb
ADDED
data/lib/scrapah/cache.rb
ADDED
@@ -0,0 +1,72 @@
+
+require 'json'
+
+module Scrapah
+
+  class Cache
+
+    # TODO: 'throws away' the whole cache after the timeout;
+    # -> treat entries as separate objects/files/dates
+
+    @@cache_dir = 'cache/'
+
+    def initialize()
+      Dir.mkdir(@@cache_dir) unless File.exists?(@@cache_dir)
+      @cache = Hash.new
+      @keep_time = 1*24*60 # in minutes
+    end
+
+    def store(key,content)
+      @cache[key] = content
+    end
+
+    def get(key)
+      @cache[key]
+    end
+
+    def has_key?(key)
+      @cache.has_key? key
+    end
+
+    def clear()
+      @cache = Hash.new
+    end
+
+    def save
+      # WARNING: Symbols converted to Strings
+      f = File.new(@@cache_dir+Time.now.to_i.to_s,'w')
+      JSON.dump(@cache,f)
+      f.close
+    end
+
+    def load
+      f = get_newest_acceptable
+      @cache = Hash.new
+      unless f.nil? # guard: don't call close on a nil handle when no cache file qualifies
+        @cache = JSON.load(f)
+        f.close
+      end
+
+      @cache
+    end
+
+    def get_hash
+      @cache
+    end
+
+    private
+
+    # returns a File handle for the newest dump younger than @keep_time, else nil
+    def get_newest_acceptable()
+      prev = Dir.glob(@@cache_dir+'*')
+      if(!prev.empty?)
+        prev.map!{|f| File.basename(f).to_i} # filenames are unix timestamps
+        prev.sort!
+        return File.new(@@cache_dir+prev.last.to_s,"r") if(Time.now.to_i-prev.last < @keep_time*60)
+      end
+
+      nil
+    end
+
+  end
+
+end
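For context, a minimal round-trip with the Cache class above might look like this. This is an editorial sketch, not from the gem's docs; it assumes `require 'scrapah'` loads `Scrapah::Cache` and that the working directory allows creating `cache/`:

    require 'scrapah'

    cache = Scrapah::Cache.new            # creates cache/ if missing
    cache.store('http://example.com/', '<html>...</html>')
    cache.save                            # dumps the hash to cache/<unix-timestamp> as JSON
    cache.clear
    cache.load                            # reloads the newest dump younger than @keep_time
    cache.get('http://example.com/')      # => "<html>...</html>"

Note the warning in `save`: because the cache is serialized through JSON, symbol keys come back as strings after a save/load cycle.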
data/lib/scrapah/extract.rb
ADDED
@@ -0,0 +1,45 @@
+
+module Scrapah
+
+  class Extract
+
+    # These methods are not perfect, but good enough.
+    # They are designed to extract from plain,
+    # non-obfuscated text...
+    # Please don't spam me =(
+
+    # TODO: make case insensitive and remove one set?
+    def self.emails(content)
+      r = Regexp.new(/\b([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,4})\b/)
+      regex(content,r)
+    end
+
+    def self.ips(content)
+      # very simple IPv4 regex
+      r = Regexp.new(/\b((?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?))\b/)
+      regex(content,r)
+    end
+
+    def self.proxies(content)
+      # ex. IPaddress:port
+      r = Regexp.new(/\b((?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\:[0-9]{1,5})\b/)
+      regex(content,r)
+    end
+
+    def self.regex(content, regex)
+      # deals with Nokogiri docs and misc input
+      if content.respond_to?(:to_s) && !content.is_a?(String)
+        content = content.to_s
+      end
+
+      results = []
+      results << content.scan(regex)
+      results = results.flatten.uniq
+
+      results
+    end
+
+  end
+
+end
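The three extractors above all funnel through `Extract.regex`, which stringifies non-String input (e.g. a Nokogiri doc) and returns the unique matches. A usage sketch (the sample text is made up; assumes `require 'scrapah'` exposes `Scrapah::Extract`):

    require 'scrapah'

    text = 'mail jane@example.org, host 10.0.0.1, proxy 10.0.0.2:8080'
    p Scrapah::Extract.emails(text)   # => ["jane@example.org"]
    p Scrapah::Extract.ips(text)      # => ["10.0.0.1", "10.0.0.2"]
    p Scrapah::Extract.proxies(text)  # => ["10.0.0.2:8080"]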
data/lib/scrapah/scraper.rb
ADDED
@@ -0,0 +1,175 @@
+
+require 'nokogiri'
+require 'retryable'
+
+# TODO optional requires?
+require 'open-uri'
+require 'watir-webdriver'
+require 'headless' # needs xvfb installed
+
+module Scrapah
+
+  class Scraper
+
+    include Retryable
+
+    # TODO needs full url for caching to work properly atm
+    # TODO Patterns class, for recursive-autodiscovery, proxy-switching, etc.?
+
+    def initialize(scrape_type=:openuri, caching=false)
+      @access_type = scrape_type
+      @current_url = ''
+
+      @caching = caching
+      if @caching
+        @cache = Scrapah::Cache.new
+        @cache.load
+      end
+
+      # .start automatically?
+    end
+
+    def start()
+      # start headless
+      if(@access_type == :headless)
+        @headless = Headless.new
+        @headless.start
+        @browser = Watir::Browser.new # default browser
+      end
+    end
+
+    def stop()
+      # end headless/close stuff
+      if(@access_type == :headless)
+        @browser.close
+        @headless.destroy
+      end
+    end
+
+    def visit(url)
+      # cache the url
+      @current_url = url
+
+      return nil if !@caching
+
+      doc = get_appropriate(url)
+
+      @cache.store(url,doc.to_s)
+      @cache.save # TODO ???
+    end
+
+    def get(url)
+      # visit(url) if caching and not cached,
+      # then return the result
+      @current_url = url
+
+      if(@caching)
+        visit(url) if !@cache.has_key? url
+        Nokogiri::HTML(@cache.get(url))
+      else
+        get_appropriate(url)
+      end
+    end
+
+    # TODO split process! and process ...
+    def process(input)
+      # get current_url source
+      doc = get(@current_url)
+
+      if input.is_a?(Hash)
+        result = Hash.new
+        input.each{|k,v| result[k] = process_appropriate(doc,v)}
+        return result
+      else
+        return process_appropriate(doc,input)
+      end
+    end
+
+    private
+
+    # TODO retry & retry strategies
+    # returns Nokogiri docs
+    def get_appropriate(url)
+      retryable :tries => 4, :sleep => 1.5 do
+        return get_headless(url) if(@access_type == :headless)
+        return get_openuri(url) if(@access_type == :openuri)
+      end
+    end
+
+    def get_headless(url)
+      return nil if !started_headless?
+
+      @browser.goto url
+      Nokogiri::HTML(@browser.html)
+    end
+
+    def get_openuri(url)
+      Nokogiri::HTML(open(url))
+    end
+
+    def started_headless?()
+      if @browser.nil? || @headless.nil?
+        raise 'Call Scraper.start first when using :headless'
+      end
+      true
+    end
+
+    # accepts Nokogiri docs only atm
+    def process_appropriate(doc,cmd)
+      return process_regex(doc,cmd) if(cmd.is_a? Regexp)
+      return process_proc(doc,cmd) if(cmd.is_a? Proc)
+
+      if cmd.is_a?(String)
+        return process_xpath(doc,cmd) if cmd.start_with?("x|")
+        return process_css(doc,cmd) if cmd.start_with?("c|")
+      end
+
+      nil
+    end
+
+    def process_regex(doc,regex)
+      doc.to_s.scan(regex).flatten
+    end
+
+    def process_xpath(doc,xpath)
+      xpath.slice!('x|')
+      sanitize_nokogiri doc.xpath(xpath)
+    end
+
+    def process_css(doc,css)
+      css.slice!('c|')
+      sanitize_nokogiri doc.css(css)
+    end
+
+    def process_proc(doc,proc)
+      proc.call(doc)
+    end
+
+    def sanitize_nokogiri(stuff)
+      return stuff.to_s if(stuff.count == 1)
+
+      result = []
+      stuff.each{|a| result << a.to_s}
+      result
+    end
+
+  end
+
+end
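Putting the pieces together: with the second constructor argument set, `visit` fetches a page and stores its source through the Cache class, and later `get`/`process` calls are served from that cache. A sketch under those assumptions (example.com is a placeholder; network access is needed for the first fetch; the readme below shows the non-caching flow):

    require 'scrapah'

    s = Scrapah::Scraper.new(:openuri, true)  # caching enabled
    s.visit 'http://example.com/'             # fetches once, stores the source
    doc = s.get('http://example.com/')        # Nokogiri doc, parsed from the cache
    p s.process('c|h1')                       # 'c|' prefix dispatches to a CSS lookup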
data/readme.markdown
ADDED
@@ -0,0 +1,95 @@
+
+Scrapah
+===
+
+Scrapah is a Ruby gem for web scraping and flexible content extraction. Scrapah takes your Hashes, regexes, XPaths, CSS selectors, and even Procs, and turns them into neat results. Scrapah is heavily based on Nokogiri.
+
+Installation
+---
+
+    gem install Scrapah
+
+    # running in :headless requires xvfb to be installed on your system
+    # for details, see the gem at https://github.com/leonid-shevtsov/headless
+    # > sudo apt-get install xvfb
+
+Using Scrapah
+---
+
+**Start** a Scraper
+
+    require 'scrapah'
+
+    # use :headless to drive a browser in the background
+    s = Scrapah::Scraper.new :openuri
+    s.start
+
+Tell Scrapah where to **go**
+
+    s.visit 'https://github.com/ZirconCode'
+
+You can pass in a **regex**
+
+    # ex. get my total contributions on github
+    p s.process(/\b(\d*?) Total\b/)
+
+Scrapah will treat a string beginning with 'c|' as a **CSS** selector
+
+    # ex. get the title tag off a page
+    p s.process 'c|h1'
+
+Scrapah will treat a string beginning with 'x|' as an **XPath**
+
+    # ex. get my popular repositories
+    p s.process 'x|//span[@class="repo"]/text()'
+
+Scrapah will even take your **Procs**
+
+    # ex. extract all emails from a website
+    p s.process Proc.new{|nokogiri_doc| Scrapah::Extract.emails nokogiri_doc}
+
+The Magic: Pass in a **Hash** of stuff, get results
+
+    s.visit 'https://github.com/ZirconCode'
+
+    profile = Hash.new
+    profile[:name] = Proc.new{|d| 'The Great '+d.xpath('//span[@class="vcard-username"]/text()').to_s}
+    profile[:total_contrib] = /\b(\d*?) Total\b/
+    profile[:join_date] = 'x|//span[@class="join-date"]/text()'
+    profile[:popular_repos] = 'x|//span[@class="repo"]/text()'
+    profile[:website] = 'x|//a[@class="url"]/text()'
+
+    p s.process(profile) # ^_^
+
+Don't forget to stop Scrapah at the end =)
+
+    s.stop
+
+Development
+---
+
+    # setup
+    git clone https://github.com/ZirconCode/Scrapah.git
+    cd Scrapah
+    bundle install
+
+    # testing
+    cd test/
+    ruby test_all.rb
+
+    # to install the gem locally
+    rake install
+
+    # Pull requests are welcome =)
+
+Todo
+---
+
+* !improve the caching
+* Optional requires (ex. watir-webdriver only when needed)
+* proxy support
+* recursive-autodiscovery, proxy-switching, and other 'patterns'
data/scrapah.gemspec
ADDED
@@ -0,0 +1,35 @@
+# -*- encoding: utf-8 -*-
+
+$:.push File.expand_path("../lib", __FILE__)
+require "scrapah/version"
+
+Gem::Specification.new do |s|
+
+  s.name = "Scrapah"
+  s.version = Scrapah::VERSION
+  s.platform = Gem::Platform::RUBY
+  s.authors = ["Simon Gruening"]
+  s.email = ["simon@zirconcode.com"]
+  s.license = 'MIT'
+  s.homepage = "https://github.com/ZirconCode/Scrapah"
+  s.summary = %q{Flexible Web Scraping and Content Extraction}
+  s.description = %q{A flexible web-scraper with built in content extraction}
+
+  s.required_ruby_version = '>= 1.9.3'
+
+  s.add_runtime_dependency 'nokogiri', '>= 1.6.0'
+  s.add_runtime_dependency 'retryable', '>= 1.3.3'
+  s.add_runtime_dependency 'json', '>= 1.8.1'
+
+  s.add_runtime_dependency 'watir-webdriver', '>= 0.6.4'
+  s.add_runtime_dependency 'headless', '>= 1.0.1' # needs xvfb installed
+
+  #s.add_development_dependency "rspec", "~>2.5.0"
+
+  s.files = `git ls-files`.split("\n")
+  s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
+  s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
+  s.require_paths = ["lib"]
+
+end
data/test/fixtures/profile.html
ADDED
@@ -0,0 +1,26 @@
+
+<html>
+
+<head>
+
+</head>
+
+<body>
+
+Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi architecto beatae vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut fugit, sed quia consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt. Neque porro quisquam est, qui dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore et dolore magnam aliquam quaerat voluptatem. Ut enim ad minima veniam, quis nostrum exercitationem ullam corporis suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur? Quis autem vel eum iure reprehenderit qui in ea voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla test@test.com pariatur?
+
+<div class="profile">
+  <div class="name">Johnny Blah</div>
+  <span id="descr">Greatest of Johnnies</span>
+  <span id="descr">also is Great</span>
+</div>
+
+<p>
+At 123.123.123.123 vero eos et accusamus et iusto odio dignissimos ducimus qui blanditiis praesentium voluptatum deleniti atque corrupti quos dolores et quas molestias excepturi sint occaecati cupiditate non provident, similique sunt in culpa qui officia deserunt mollitia animi, id est laborum et dolorum fuga. Et harum quidem rerum facilis est et expedita distinctio. Nam libero tempore, cum soluta nobis est eligendi optio cumque nihil impedit quo minus id quod maxime placeat facere possimus, omnis voluptas assumenda est, omnis dolor repellendus. Temporibus autem quibusdam et aut officiis debitis aut rerum necessitatibus saepe eveniet ut et voluptates repudiandae sint et molestiae non recusandae. Itaque earum rerum hic tenetur a sapiente delectus, ut aut reiciendis voluptatibus maiores alias consequatur aut perferendis doloribus asperiores repellat
+</p>
+
+</body>
+
+</html>
data/test/test_all.rb
ADDED
data/test/test_cache.rb
ADDED
@@ -0,0 +1,51 @@
+
+# Test cache.rb
+
+require 'test/unit'
+require '../lib/scrapah/cache'
+
+
+class TestCache < Test::Unit::TestCase
+
+  def setup
+    @c = Scrapah::Cache.new
+  end
+
+  def test_new()
+    assert(@c.instance_of? Scrapah::Cache)
+  end
+
+  def test_store_and_get
+    @c.store("test",3)
+    assert(@c.get("test") == 3)
+
+    assert(@c.get("doesn't exist").nil?)
+  end
+
+  def test_has_key()
+    @c.store("test",3)
+    assert(@c.has_key? "test")
+    assert(!@c.has_key?("nope"))
+  end
+
+  def test_clear()
+    @c.store("test",3)
+
+    assert(@c.get("test")==3)
+    @c.clear
+    assert(@c.get("test").nil?)
+  end
+
+  def test_save_and_load()
+    @c.store("old",1)
+    @c.save
+    @c.clear
+    @c.load
+    assert(@c.get("old")==1)
+  end
+
+end
data/test/test_extract.rb
ADDED
@@ -0,0 +1,34 @@
+
+# Test extract.rb
+
+require 'test/unit'
+require '../lib/scrapah/extract'
+
+
+class TestExtract < Test::Unit::TestCase
+
+  def test_emails
+    s = "test tes.com test@test.com tete@ .org"
+    assert(Scrapah::Extract.emails(s).count == 1)
+  end
+
+  def test_ips
+    s = "654.123 123.143.114.123 123.143.654.123 123.143.654.12322"
+    assert(Scrapah::Extract.ips(s).count == 1)
+  end
+
+  def test_proxies
+    s = "123.143. 654.123 112.143.114.123:4444 123.143.654.12322"
+    assert(Scrapah::Extract.proxies(s).count == 1)
+  end
+
+  def test_regex
+    s = 'hello hall o ello olle o.O/'
+    assert(Scrapah::Extract.regex(s,/(.ll)/).count == 3)
+  end
+
+end
data/test/test_scraper.rb
ADDED
@@ -0,0 +1,100 @@
+
+# Test scraper.rb
+
+require 'test/unit'
+require '../lib/scrapah/scraper'
+require '../lib/scrapah/extract' # for test_process
+
+
+class TestScraper < Test::Unit::TestCase
+
+  def setup
+    # Uses fixtures/profile.html as "web access"
+    @fixture_profile = Dir.pwd+"/fixtures/profile.html"
+  end
+
+  def test_new_openuri
+    s = nil
+
+    assert_nothing_raised do
+      s = Scrapah::Scraper.new :openuri
+    end
+    assert(s.is_a? Scrapah::Scraper)
+  end
+
+  def test_get_openuri
+    s = Scrapah::Scraper.new :openuri
+    f = @fixture_profile
+
+    assert(s.get(f).to_s.include? 'Sed ut perspiciatis unde omnis')
+  end
+
+  def test_new_headless
+    s = nil
+
+    assert_nothing_raised do
+      s = Scrapah::Scraper.new :headless
+    end
+    assert(s.is_a? Scrapah::Scraper)
+  end
+
+  # heavy test
+  def test_start_stop_headless
+    s = Scrapah::Scraper.new :headless
+
+    assert_nothing_raised do
+      s.start
+      s.stop
+    end
+  end
+
+  # heavy test
+  def test_get_headless
+    s = Scrapah::Scraper.new :headless
+    f = 'file://'+@fixture_profile
+
+    s.start
+
+    assert(s.get(f).to_s.include? 'Sed ut perspiciatis unde omnis')
+
+    s.stop
+  end
+
+  # heavy test
+  def test_get_no_start_headless
+    s = Scrapah::Scraper.new :headless
+
+    assert_raise RuntimeError do
+      s.get('blah')
+    end
+  end
+
+  # Full Use Test
+  def test_process
+    s = Scrapah::Scraper.new :openuri
+    f = @fixture_profile
+
+    s.visit(f)
+
+    assert(s.process(/Sed ut perspiciatis/).count == 1)
+
+    h = Hash.new
+    h[:regex] = /Sed ut perspiciatis/
+    h[:proc] = Proc.new{|d| Scrapah::Extract.emails(d)}
+    h[:css] = "c|span"
+    h[:xpath] = "x|/html/body/div/div"
+
+    result = s.process(h)
+
+    assert(result[:regex].count == 1)
+    assert(result[:proc].count == 1)
+    assert(result[:css].count == 2)
+    assert(result[:xpath].include? 'Johnny Blah')
+  end
+
+end
metadata
ADDED
@@ -0,0 +1,118 @@
+--- !ruby/object:Gem::Specification
+name: Scrapah
+version: !ruby/object:Gem::Version
+  version: 1.0.0
+prerelease:
+platform: ruby
+authors:
+- Simon Gruening
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2013-11-19 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: nokogiri
+  requirement: &79704310 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: 1.6.0
+  type: :runtime
+  prerelease: false
+  version_requirements: *79704310
+- !ruby/object:Gem::Dependency
+  name: retryable
+  requirement: &79704080 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: 1.3.3
+  type: :runtime
+  prerelease: false
+  version_requirements: *79704080
+- !ruby/object:Gem::Dependency
+  name: json
+  requirement: &79703850 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: 1.8.1
+  type: :runtime
+  prerelease: false
+  version_requirements: *79703850
+- !ruby/object:Gem::Dependency
+  name: watir-webdriver
+  requirement: &79703620 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: 0.6.4
+  type: :runtime
+  prerelease: false
+  version_requirements: *79703620
+- !ruby/object:Gem::Dependency
+  name: headless
+  requirement: &79703390 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: 1.0.1
+  type: :runtime
+  prerelease: false
+  version_requirements: *79703390
+description: A flexible web-scraper with built in content extraction
+email:
+- simon@zirconcode.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- .gitignore
+- Gemfile
+- Gemfile.lock
+- LICENSE
+- Rakefile
+- lib/scrapah.rb
+- lib/scrapah/cache.rb
+- lib/scrapah/extract.rb
+- lib/scrapah/scraper.rb
+- lib/scrapah/version.rb
+- readme.markdown
+- scrapah.gemspec
+- test/fixtures/profile.html
+- test/test_all.rb
+- test/test_cache.rb
+- test/test_extract.rb
+- test/test_scraper.rb
+homepage: https://github.com/ZirconCode/Scrapah
+licenses:
+- MIT
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: 1.9.3
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 1.8.11
+signing_key:
+specification_version: 3
+summary: Flexible Web Scraping and Content Extraction
+test_files: []