spieker 0.0.3 → 0.0.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG +6 -0
- data/{LICENSE.txt → LICENSE} +0 -0
- data/README.md +43 -0
- data/bin/spieker +7 -1
- data/lib/spieker/crawler.rb +3 -2
- data/lib/spieker/link_scraper.rb +21 -9
- data/lib/spieker/version.rb +1 -1
- data/test/link_scraper_test.rb +5 -6
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8d68fdae00bc592760e27214ce5b0546702034a9
|
4
|
+
data.tar.gz: 2451ccff0b6d42c869b1f3fbada18fc700806bf5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2689c5559124616dc3305271a479f1e52212788cfc116d3524283b6da38c8395e84ae2b64d7abc3cc18b5be45c277200d83e8669e73f757e222af338e7d5f420
|
7
|
+
data.tar.gz: 7a7c4fa8ac735de6440484861035000816cac7fdd2a19f29207dceb75912f638aa5e0ace15f76fb2a094ebe398a7341043620d693ac63ace5ce702b933b32073
|
data/CHANGELOG
CHANGED
data/{LICENSE.txt → LICENSE}
RENAMED
File without changes
|
data/README.md
CHANGED
@@ -0,0 +1,43 @@
|
|
1
|
+
# Tolq Spieker
|
2
|
+
|
3
|
+
Tolq Spieker is a JavaScript-friendly crawler. It uses Selenium and
|
4
|
+
Capybara to crawl a site.
|
5
|
+
|
6
|
+
## Installation and usage
|
7
|
+
|
8
|
+
```
|
9
|
+
$ gem install spieker
|
10
|
+
$ spieker <url>
|
11
|
+
```
|
12
|
+
|
13
|
+
You can also use the library from your apps to do whatever you please.
|
14
|
+
|
15
|
+
## TODO
|
16
|
+
|
17
|
+
* Be awesome in parallel
|
18
|
+
* Use Poltergeist instead; it does not work well by default
|
19
|
+
|
20
|
+
## How it works
|
21
|
+
|
22
|
+
Spieker makes a request to the URL you provide. It scrapes all links,
|
23
|
+
filters out any remote or non-resource links, and crawls those on its own
|
24
|
+
terms. It tracks visited links in memory. When using the binary, it very
|
25
|
+
verbosely outputs this to STDOUT.
|
26
|
+
|
27
|
+
## Changelog
|
28
|
+
|
29
|
+
See [Changelog](CHANGELOG)
|
30
|
+
|
31
|
+
## Contribute
|
32
|
+
|
33
|
+
* fork
|
34
|
+
* test
|
35
|
+
* pull request
|
36
|
+
|
37
|
+
### Contributors
|
38
|
+
|
39
|
+
* [Timon Vonk](https://www.github.com/timonv)
|
40
|
+
|
41
|
+
## License
|
42
|
+
|
43
|
+
See [License](LICENSE)
|
data/bin/spieker
CHANGED
@@ -2,4 +2,10 @@
|
|
2
2
|
|
3
3
|
require 'spieker'
|
4
4
|
|
5
|
-
|
5
|
+
if ARGV[0] == '--version' or ARGV[0] == '-v'
|
6
|
+
puts Spieker::VERSION
|
7
|
+
exit
|
8
|
+
end
|
9
|
+
|
10
|
+
lang = ARGV[1] && ARGV[1].length > 0 ? ARGV[1] : 'en'
|
11
|
+
Spieker::Crawler.new(ARGV[0], verbose: true, lang: lang).crawl!
|
data/lib/spieker/crawler.rb
CHANGED
@@ -1,15 +1,16 @@
|
|
1
1
|
module Spieker
|
2
2
|
class Crawler
|
3
|
-
def initialize(url, verbose: false)
|
3
|
+
def initialize(url, verbose: false, lang: 'en')
|
4
4
|
@url = url
|
5
5
|
@tracked_links = []
|
6
6
|
@verbose = verbose
|
7
|
+
@lang = lang
|
7
8
|
end
|
8
9
|
|
9
10
|
def crawl!
|
10
11
|
report "Starting to crawl on #{@url}"
|
11
12
|
|
12
|
-
scraper = LinkScraper.new(@url)
|
13
|
+
scraper = LinkScraper.new(@url, lang: @lang)
|
13
14
|
track_link(@url)
|
14
15
|
links = scraper.result
|
15
16
|
recursively_crawl(links)
|
data/lib/spieker/link_scraper.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
require 'capybara'
|
2
|
-
require 'capybara/
|
2
|
+
require 'capybara/selenium/driver'
|
3
|
+
require 'selenium/webdriver'
|
3
4
|
|
4
5
|
module Spieker
|
5
6
|
class LinkScraper
|
@@ -7,15 +8,19 @@ module Spieker
|
|
7
8
|
include Capybara::DSL
|
8
9
|
attr_writer :links
|
9
10
|
|
10
|
-
def initialize(url)
|
11
|
+
def initialize(url, lang: 'en')
|
11
12
|
@url = URI.parse(url)
|
13
|
+
@lang = lang
|
12
14
|
Capybara.app_host = app_host
|
13
15
|
|
14
|
-
Capybara.register_driver :
|
15
|
-
|
16
|
+
Capybara.register_driver :tolq do |app|
|
17
|
+
profile = Selenium::WebDriver::Firefox::Profile.new
|
18
|
+
profile['general.useragent.override'] = "Mozilla/5.0 (compatible; Tolq Spieker/#{Spieker::VERSION}; +http://www.tolq.com)"
|
19
|
+
|
20
|
+
Capybara::Selenium::Driver.new(app, :profile => profile)
|
16
21
|
end
|
17
22
|
|
18
|
-
Capybara.current_driver = :
|
23
|
+
Capybara.current_driver = :tolq
|
19
24
|
end
|
20
25
|
|
21
26
|
def result
|
@@ -34,10 +39,17 @@ module Spieker
|
|
34
39
|
|
35
40
|
def drive_page_for_links
|
36
41
|
begin
|
37
|
-
visit @url.path
|
38
|
-
page.all('a').map { |el| el[:href]}
|
39
|
-
|
40
|
-
|
42
|
+
visit @url.path + "#!lang=#{@lang}"
|
43
|
+
links = page.all('a').map { |el| el[:href]}
|
44
|
+
begin
|
45
|
+
# Our JavaScript adds a class if the content has been successfully submitted
|
46
|
+
page.find(:css, 'html.tolq-content-updated')
|
47
|
+
rescue Capybara::Ambiguous, Capybara::ElementNotFound => e
|
48
|
+
puts "Something went wrong with submitting the content #{e.inspect}"
|
49
|
+
end
|
50
|
+
links
|
51
|
+
rescue => e
|
52
|
+
puts "Error parsing #{@url.to_s}, #{e.inspect}"
|
41
53
|
[]
|
42
54
|
end
|
43
55
|
end
|
data/lib/spieker/version.rb
CHANGED
data/test/link_scraper_test.rb
CHANGED
@@ -2,21 +2,20 @@ require 'test_helper'
|
|
2
2
|
|
3
3
|
class TestLinkScraper < Test::Unit::TestCase
|
4
4
|
def setup
|
5
|
-
@scraper = Spieker::LinkScraper.new('http://
|
5
|
+
@scraper = Spieker::LinkScraper.new('http://supersimple.st.tolq.com')
|
6
6
|
end
|
7
7
|
|
8
8
|
def test_apphost
|
9
|
-
assert_equal 'http://
|
9
|
+
assert_equal 'http://supersimple.st.tolq.com', @scraper.app_host
|
10
10
|
end
|
11
11
|
|
12
12
|
def test_result
|
13
|
-
|
14
|
-
assert @scraper.result.any?
|
13
|
+
#assert @scraper.result.any?
|
15
14
|
end
|
16
15
|
|
17
16
|
def test_filter_local
|
18
17
|
found_links = [
|
19
|
-
'http://
|
18
|
+
'http://supersimple.st.tolq.com/local',
|
20
19
|
'/local',
|
21
20
|
'/local#justahash',
|
22
21
|
'#justahash',
|
@@ -26,7 +25,7 @@ class TestLinkScraper < Test::Unit::TestCase
|
|
26
25
|
'pdf'
|
27
26
|
]
|
28
27
|
expected_links = [
|
29
|
-
'http://
|
28
|
+
'http://supersimple.st.tolq.com/local',
|
30
29
|
'/local',
|
31
30
|
''
|
32
31
|
]
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: spieker
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Timon Vonk
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-
|
11
|
+
date: 2013-09-03 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -105,7 +105,7 @@ files:
|
|
105
105
|
- .gitignore
|
106
106
|
- CHANGELOG
|
107
107
|
- Gemfile
|
108
|
-
- LICENSE
|
108
|
+
- LICENSE
|
109
109
|
- README.md
|
110
110
|
- Rakefile
|
111
111
|
- bin/spieker
|