spieker 0.0.3 → 0.0.7

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: c787dd82625950fc0ce4eca39e86d29c021b5414
4
- data.tar.gz: 9825847bfbe9cb7a09961a26f191b88ff8b0ec0a
3
+ metadata.gz: 8d68fdae00bc592760e27214ce5b0546702034a9
4
+ data.tar.gz: 2451ccff0b6d42c869b1f3fbada18fc700806bf5
5
5
  SHA512:
6
- metadata.gz: 4f50c13bdac9438682de85294c0cad57848d0e7205be4d76d0d4809894b6338cb989ff833ec4e7304bfa9bb092a6ee52d7cfdac9aeb120b258c4dcc11622a59c
7
- data.tar.gz: 9dc328ea3b99435490d8620cc65f42604415592c826c8ff56ce897a107ca4f0d63881991ab411a7bacab58940acc893f082362380813e52e593700d85f5bfbb8
6
+ metadata.gz: 2689c5559124616dc3305271a479f1e52212788cfc116d3524283b6da38c8395e84ae2b64d7abc3cc18b5be45c277200d83e8669e73f757e222af338e7d5f420
7
+ data.tar.gz: 7a7c4fa8ac735de6440484861035000816cac7fdd2a19f29207dceb75912f638aa5e0ace15f76fb2a094ebe398a7341043620d693ac63ace5ce702b933b32073
data/CHANGELOG CHANGED
@@ -1,2 +1,8 @@
1
+ 0.0.7
2
+ * Support setting the language as a parameter (default is 'en')
3
+ * Wait for tolqjs to submit the content before continuing
4
+ 0.0.4
5
+ * Set useragent to Tolq Spieker
6
+
1
7
  0.0.3
2
8
  * Ignore pdf and javascript links
File without changes
data/README.md CHANGED
@@ -0,0 +1,43 @@
1
+ # Tolq Spieker
2
+
3
+ Tolq Spieker is a JavaScript-friendly crawler. It uses selenium and
4
+ capybara to crawl a site.
5
+
6
+ ## Installation and usage
7
+
8
+ ```
9
+ $ gem install spieker
10
+ $ spieker <url>
11
+ ```
12
+
13
+ You can also use the library from your apps to do whatever you please.
14
+
15
+ ## TODO
16
+
17
+ * Be awesome in parallel
18
+ * Use poltergeist instead (it does not work well by default)
19
+
20
+ ## How it works
21
+
22
+ Spieker makes a request to the URL you provide. It scrapes all links,
23
+ filters out any remote or non-resource links, and crawls the rest on its own
24
+ terms. It tracks visited links in memory. When using the binary, it very
25
+ verbosely outputs this to STDOUT.
26
+
27
+ ## Changelog
28
+
29
+ See [Changelog](CHANGELOG)
30
+
31
+ ## Contribute
32
+
33
+ * fork
34
+ * test
35
+ * pull request
36
+
37
+ ### Contributors
38
+
39
+ * [Timon Vonk](https://www.github.com/timonv)
40
+
41
+ ## License
42
+
43
+ See [License](LICENSE)
@@ -2,4 +2,10 @@
2
2
 
3
3
  require 'spieker'
4
4
 
5
- Spieker::Crawler.new(ARGV[0], verbose: true).crawl!
5
+ if ARGV[0] == '--version' or ARGV[0] == '-v'
6
+ puts Spieker::VERSION
7
+ exit
8
+ end
9
+
10
+ lang = ARGV[1] && ARGV[1].length > 0 ? ARGV[1] : 'en'
11
+ Spieker::Crawler.new(ARGV[0], verbose: true, lang: lang).crawl!
@@ -1,15 +1,16 @@
1
1
  module Spieker
2
2
  class Crawler
3
- def initialize(url, verbose: false)
3
+ def initialize(url, verbose: false, lang: 'en')
4
4
  @url = url
5
5
  @tracked_links = []
6
6
  @verbose = verbose
7
+ @lang = lang
7
8
  end
8
9
 
9
10
  def crawl!
10
11
  report "Starting to crawl on #{@url}"
11
12
 
12
- scraper = LinkScraper.new(@url)
13
+ scraper = LinkScraper.new(@url, lang: @lang)
13
14
  track_link(@url)
14
15
  links = scraper.result
15
16
  recursively_crawl(links)
@@ -1,5 +1,6 @@
1
1
  require 'capybara'
2
- require 'capybara/poltergeist'
2
+ require 'capybara/selenium/driver'
3
+ require 'selenium/webdriver'
3
4
 
4
5
  module Spieker
5
6
  class LinkScraper
@@ -7,15 +8,19 @@ module Spieker
7
8
  include Capybara::DSL
8
9
  attr_writer :links
9
10
 
10
- def initialize(url)
11
+ def initialize(url, lang: 'en')
11
12
  @url = URI.parse(url)
13
+ @lang = lang
12
14
  Capybara.app_host = app_host
13
15
 
14
- Capybara.register_driver :poltergeist do |app|
15
- Capybara::Poltergeist::Driver.new(app, phantomjs_logger: NullStream.new )
16
+ Capybara.register_driver :tolq do |app|
17
+ profile = Selenium::WebDriver::Firefox::Profile.new
18
+ profile['general.useragent.override'] = "Mozilla/5.0 (compatible; Tolq Spieker/#{Spieker::VERSION}; +http://www.tolq.com)"
19
+
20
+ Capybara::Selenium::Driver.new(app, :profile => profile)
16
21
  end
17
22
 
18
- Capybara.current_driver = :selenium
23
+ Capybara.current_driver = :tolq
19
24
  end
20
25
 
21
26
  def result
@@ -34,10 +39,17 @@ module Spieker
34
39
 
35
40
  def drive_page_for_links
36
41
  begin
37
- visit @url.path
38
- page.all('a').map { |el| el[:href]}
39
- rescue
40
- puts "Error parsing #{@url.to_s}"
42
+ visit @url.path + "#!lang=#{@lang}"
43
+ links = page.all('a').map { |el| el[:href]}
44
+ begin
45
+ # Our javascript adds a class if the content has been successfully submitted
46
+ page.find(:css, 'html.tolq-content-updated')
47
+ rescue Capybara::Ambiguous, Capybara::ElementNotFound => e
48
+ puts "Something went wrong with submitting the content #{e.inspect}"
49
+ end
50
+ links
51
+ rescue => e
52
+ puts "Error parsing #{@url.to_s}, #{e.inspect}"
41
53
  []
42
54
  end
43
55
  end
@@ -1,3 +1,3 @@
1
1
  module Spieker
2
- VERSION = "0.0.3"
2
+ VERSION = "0.0.7"
3
3
  end
@@ -2,21 +2,20 @@ require 'test_helper'
2
2
 
3
3
  class TestLinkScraper < Test::Unit::TestCase
4
4
  def setup
5
- @scraper = Spieker::LinkScraper.new('http://www.google.com')
5
+ @scraper = Spieker::LinkScraper.new('http://supersimple.st.tolq.com')
6
6
  end
7
7
 
8
8
  def test_apphost
9
- assert_equal 'http://www.google.com', @scraper.app_host
9
+ assert_equal 'http://supersimple.st.tolq.com', @scraper.app_host
10
10
  end
11
11
 
12
12
  def test_result
13
- return
14
- assert @scraper.result.any?
13
+ #assert @scraper.result.any?
15
14
  end
16
15
 
17
16
  def test_filter_local
18
17
  found_links = [
19
- 'http://www.google.com/local',
18
+ 'http://supersimple.st.tolq.com/local',
20
19
  '/local',
21
20
  '/local#justahash',
22
21
  '#justahash',
@@ -26,7 +25,7 @@ class TestLinkScraper < Test::Unit::TestCase
26
25
  'pdf'
27
26
  ]
28
27
  expected_links = [
29
- 'http://www.google.com/local',
28
+ 'http://supersimple.st.tolq.com/local',
30
29
  '/local',
31
30
  ''
32
31
  ]
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spieker
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Timon Vonk
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-08-28 00:00:00.000000000 Z
11
+ date: 2013-09-03 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -105,7 +105,7 @@ files:
105
105
  - .gitignore
106
106
  - CHANGELOG
107
107
  - Gemfile
108
- - LICENSE.txt
108
+ - LICENSE
109
109
  - README.md
110
110
  - Rakefile
111
111
  - bin/spieker