spieker 0.0.3 → 0.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: c787dd82625950fc0ce4eca39e86d29c021b5414
4
- data.tar.gz: 9825847bfbe9cb7a09961a26f191b88ff8b0ec0a
3
+ metadata.gz: 8d68fdae00bc592760e27214ce5b0546702034a9
4
+ data.tar.gz: 2451ccff0b6d42c869b1f3fbada18fc700806bf5
5
5
  SHA512:
6
- metadata.gz: 4f50c13bdac9438682de85294c0cad57848d0e7205be4d76d0d4809894b6338cb989ff833ec4e7304bfa9bb092a6ee52d7cfdac9aeb120b258c4dcc11622a59c
7
- data.tar.gz: 9dc328ea3b99435490d8620cc65f42604415592c826c8ff56ce897a107ca4f0d63881991ab411a7bacab58940acc893f082362380813e52e593700d85f5bfbb8
6
+ metadata.gz: 2689c5559124616dc3305271a479f1e52212788cfc116d3524283b6da38c8395e84ae2b64d7abc3cc18b5be45c277200d83e8669e73f757e222af338e7d5f420
7
+ data.tar.gz: 7a7c4fa8ac735de6440484861035000816cac7fdd2a19f29207dceb75912f638aa5e0ace15f76fb2a094ebe398a7341043620d693ac63ace5ce702b933b32073
data/CHANGELOG CHANGED
@@ -1,2 +1,8 @@
1
+ 0.0.7
2
+ * Support setting the language as a parameter, default is 'en'
3
+ * Wait for tolqjs to submit the content before continuing
4
+ 0.0.4
5
+ * Set useragent to Tolq Spieker
6
+
1
7
  0.0.3
2
8
  * Ignore pdf and javascript links
File without changes
data/README.md CHANGED
@@ -0,0 +1,43 @@
1
+ # Tolq Spieker
2
+
3
+ Tolq Spieker is a JavaScript-friendly crawler. It uses Selenium and
4
+ Capybara to crawl a site.
5
+
6
+ ## Installation and usage
7
+
8
+ ```
9
+ $ gem install spieker
10
+ $ spieker <url>
11
+ ```
12
+
13
+ You can also use the library from your apps to do whatever you please.
14
+
15
+ ## TODO
16
+
17
+ * Be awesome in parallel
18
+ * Use poltergeist instead, does not work well by default
19
+
20
+ ## How it works
21
+
22
+ Spieker makes a request to the URL you provide. It scrapes all links,
23
+ filters out any remote or non-resource links, and crawls the rest on its own
24
+ terms. It tracks visited links in memory. When using the binary, it very
25
+ verbosely outputs this to STDOUT.
26
+
27
+ ## Changelog
28
+
29
+ See [Changelog](CHANGELOG)
30
+
31
+ ## Contribute
32
+
33
+ * fork
34
+ * test
35
+ * pull request
36
+
37
+ ### Contributors
38
+
39
+ * [Timon Vonk](https://www.github.com/timonv)
40
+
41
+ ## License
42
+
43
+ See [License](LICENSE)
@@ -2,4 +2,10 @@
2
2
 
3
3
  require 'spieker'
4
4
 
5
- Spieker::Crawler.new(ARGV[0], verbose: true).crawl!
5
+ if ARGV[0] == '--version' or ARGV[0] == '-v'
6
+ puts Spieker::VERSION
7
+ exit
8
+ end
9
+
10
+ lang = ARGV[1] && ARGV[1].length > 0 ? ARGV[1] : 'en'
11
+ Spieker::Crawler.new(ARGV[0], verbose: true, lang: lang).crawl!
@@ -1,15 +1,16 @@
1
1
  module Spieker
2
2
  class Crawler
3
- def initialize(url, verbose: false)
3
+ def initialize(url, verbose: false, lang: 'en')
4
4
  @url = url
5
5
  @tracked_links = []
6
6
  @verbose = verbose
7
+ @lang = lang
7
8
  end
8
9
 
9
10
  def crawl!
10
11
  report "Starting to crawl on #{@url}"
11
12
 
12
- scraper = LinkScraper.new(@url)
13
+ scraper = LinkScraper.new(@url, lang: @lang)
13
14
  track_link(@url)
14
15
  links = scraper.result
15
16
  recursively_crawl(links)
@@ -1,5 +1,6 @@
1
1
  require 'capybara'
2
- require 'capybara/poltergeist'
2
+ require 'capybara/selenium/driver'
3
+ require 'selenium/webdriver'
3
4
 
4
5
  module Spieker
5
6
  class LinkScraper
@@ -7,15 +8,19 @@ module Spieker
7
8
  include Capybara::DSL
8
9
  attr_writer :links
9
10
 
10
- def initialize(url)
11
+ def initialize(url, lang: 'en')
11
12
  @url = URI.parse(url)
13
+ @lang = lang
12
14
  Capybara.app_host = app_host
13
15
 
14
- Capybara.register_driver :poltergeist do |app|
15
- Capybara::Poltergeist::Driver.new(app, phantomjs_logger: NullStream.new )
16
+ Capybara.register_driver :tolq do |app|
17
+ profile = Selenium::WebDriver::Firefox::Profile.new
18
+ profile['general.useragent.override'] = "Mozilla/5.0 (compatible; Tolq Spieker/#{Spieker::VERSION}; +http://www.tolq.com)"
19
+
20
+ Capybara::Selenium::Driver.new(app, :profile => profile)
16
21
  end
17
22
 
18
- Capybara.current_driver = :selenium
23
+ Capybara.current_driver = :tolq
19
24
  end
20
25
 
21
26
  def result
@@ -34,10 +39,17 @@ module Spieker
34
39
 
35
40
  def drive_page_for_links
36
41
  begin
37
- visit @url.path
38
- page.all('a').map { |el| el[:href]}
39
- rescue
40
- puts "Error parsing #{@url.to_s}"
42
+ visit @url.path + "#!lang=#{@lang}"
43
+ links = page.all('a').map { |el| el[:href]}
44
+ begin
45
+ # Our javascript adds a class if the content has been successfully submitted
46
+ page.find(:css, 'html.tolq-content-updated')
47
+ rescue Capybara::Ambiguous, Capybara::ElementNotFound => e
48
+ puts "Something went wrong with submitting the content #{e.inspect}"
49
+ end
50
+ links
51
+ rescue => e
52
+ puts "Error parsing #{@url.to_s}, #{e.inspect}"
41
53
  []
42
54
  end
43
55
  end
@@ -1,3 +1,3 @@
1
1
  module Spieker
2
- VERSION = "0.0.3"
2
+ VERSION = "0.0.7"
3
3
  end
@@ -2,21 +2,20 @@ require 'test_helper'
2
2
 
3
3
  class TestLinkScraper < Test::Unit::TestCase
4
4
  def setup
5
- @scraper = Spieker::LinkScraper.new('http://www.google.com')
5
+ @scraper = Spieker::LinkScraper.new('http://supersimple.st.tolq.com')
6
6
  end
7
7
 
8
8
  def test_apphost
9
- assert_equal 'http://www.google.com', @scraper.app_host
9
+ assert_equal 'http://supersimple.st.tolq.com', @scraper.app_host
10
10
  end
11
11
 
12
12
  def test_result
13
- return
14
- assert @scraper.result.any?
13
+ #assert @scraper.result.any?
15
14
  end
16
15
 
17
16
  def test_filter_local
18
17
  found_links = [
19
- 'http://www.google.com/local',
18
+ 'http://supersimple.st.tolq.com/local',
20
19
  '/local',
21
20
  '/local#justahash',
22
21
  '#justahash',
@@ -26,7 +25,7 @@ class TestLinkScraper < Test::Unit::TestCase
26
25
  'pdf'
27
26
  ]
28
27
  expected_links = [
29
- 'http://www.google.com/local',
28
+ 'http://supersimple.st.tolq.com/local',
30
29
  '/local',
31
30
  ''
32
31
  ]
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spieker
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Timon Vonk
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-08-28 00:00:00.000000000 Z
11
+ date: 2013-09-03 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -105,7 +105,7 @@ files:
105
105
  - .gitignore
106
106
  - CHANGELOG
107
107
  - Gemfile
108
- - LICENSE.txt
108
+ - LICENSE
109
109
  - README.md
110
110
  - Rakefile
111
111
  - bin/spieker