spidr 0.5.0 → 0.6.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (47)
  1. checksums.yaml +4 -4
  2. data/.travis.yml +14 -0
  3. data/ChangeLog.md +20 -2
  4. data/Gemfile +2 -2
  5. data/README.md +4 -2
  6. data/Rakefile +1 -0
  7. data/gemspec.yml +1 -1
  8. data/lib/spidr/agent.rb +145 -85
  9. data/lib/spidr/agent/filters.rb +1 -9
  10. data/lib/spidr/agent/robots.rb +36 -0
  11. data/lib/spidr/page.rb +76 -28
  12. data/lib/spidr/page/{headers.rb → content_types.rb} +2 -147
  13. data/lib/spidr/page/cookies.rb +60 -0
  14. data/lib/spidr/page/{links.rb → html.rb} +47 -23
  15. data/lib/spidr/page/status_codes.rb +112 -0
  16. data/lib/spidr/proxy.rb +56 -0
  17. data/lib/spidr/session_cache.rb +60 -24
  18. data/lib/spidr/settings.rb +3 -0
  19. data/lib/spidr/settings/proxy.rb +61 -0
  20. data/lib/spidr/settings/timeouts.rb +33 -0
  21. data/lib/spidr/settings/user_agent.rb +14 -0
  22. data/lib/spidr/spidr.rb +15 -79
  23. data/lib/spidr/version.rb +1 -1
  24. data/spec/agent/actions_spec.rb +158 -32
  25. data/spec/agent/filters_spec.rb +46 -29
  26. data/spec/agent/sanitizers_spec.rb +25 -31
  27. data/spec/agent_spec.rb +772 -50
  28. data/spec/example_app.rb +27 -0
  29. data/spec/example_page.rb +33 -0
  30. data/spec/page/content_types_spec.rb +150 -0
  31. data/spec/page/cookies_spec.rb +58 -0
  32. data/spec/page/html_spec.rb +524 -0
  33. data/spec/page/status_codes_spec.rb +87 -0
  34. data/spec/page_spec.rb +114 -78
  35. data/spec/proxy_spec.rb +45 -0
  36. data/spec/session_cache.rb +103 -2
  37. data/spec/settings/proxy_examples.rb +82 -0
  38. data/spec/settings/timeouts_examples.rb +93 -0
  39. data/spec/settings/user_agent_examples.rb +25 -0
  40. data/spec/spidr_spec.rb +6 -29
  41. data/spidr.gemspec +38 -109
  42. metadata +35 -31
  43. data/lib/spidr/page/body.rb +0 -98
  44. data/spec/helpers/history.rb +0 -34
  45. data/spec/helpers/page.rb +0 -8
  46. data/spec/helpers/wsoc.rb +0 -83
  47. data/spec/page_examples.rb +0 -21
@@ -1,98 +0,0 @@
1
- require 'nokogiri'
2
-
3
- module Spidr
4
- class Page
5
- #
6
- # The body of the response.
7
- #
8
- # @return [String]
9
- # The body of the response.
10
- #
11
- def body
12
- (response.body || '')
13
- end
14
-
15
- #
16
- # Returns a parsed document object for HTML, XML, RSS and Atom pages.
17
- #
18
- # @return [Nokogiri::HTML::Document, Nokogiri::XML::Document, nil]
19
- # The document that represents HTML or XML pages.
20
- # Returns `nil` if the page is neither HTML, XML, RSS, Atom or if
21
- # the page could not be parsed properly.
22
- #
23
- # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
24
- # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/HTML/Document.html
25
- #
26
- def doc
27
- unless body.empty?
28
- begin
29
- if html?
30
- @doc ||= Nokogiri::HTML(body, @url.to_s, content_charset)
31
- elsif (rss? || atom? || xml? || xsl?)
32
- @doc ||= Nokogiri::XML(body, @url.to_s, content_charset)
33
- end
34
- rescue
35
- end
36
- end
37
- end
38
-
39
- #
40
- # Searches the document for XPath or CSS Path paths.
41
- #
42
- # @param [Array<String>] paths
43
- # CSS or XPath expressions to search the document with.
44
- #
45
- # @return [Array]
46
- # The matched nodes from the document.
47
- # Returns an empty Array if no nodes were matched, or if the page
48
- # is not an HTML or XML document.
49
- #
50
- # @example
51
- # page.search('//a[@href]')
52
- #
53
- # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Node.html#M000239
54
- #
55
- def search(*paths)
56
- if doc
57
- doc.search(*paths)
58
- else
59
- []
60
- end
61
- end
62
-
63
- #
64
- # Searches for the first occurrence of an XPath or CSS Path expression.
65
- #
66
- # @return [Nokogiri::HTML::Node, Nokogiri::XML::Node, nil]
67
- # The first matched node. Returns `nil` if no nodes could be matched,
68
- # or if the page is not a HTML or XML document.
69
- #
70
- # @example
71
- # page.at('//title')
72
- #
73
- # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Node.html#M000251
74
- #
75
- def at(*arguments)
76
- if doc
77
- doc.at(*arguments)
78
- end
79
- end
80
-
81
- alias / search
82
- alias % at
83
-
84
- #
85
- # The title of the HTML page.
86
- #
87
- # @return [String]
88
- # The inner-text of the title element of the page.
89
- #
90
- def title
91
- if (node = at('//title'))
92
- node.inner_text
93
- end
94
- end
95
-
96
- alias to_s body
97
- end
98
- end
@@ -1,34 +0,0 @@
1
- module Helpers
2
- module History
3
- def visited_once?(url)
4
- return @agent.visited_urls.select { |visited_url|
5
- visited_url == url
6
- }.length == 1
7
- end
8
-
9
- def visited_link?(url)
10
- @agent.visited?(url)
11
- end
12
-
13
- def visit_failed?(url)
14
- @agent.failed?(url)
15
- end
16
-
17
- def should_visit_link(url)
18
- expect(visited_link?(url)).to eq(true)
19
- end
20
-
21
- def should_ignore_link(url)
22
- expect(visited_link?(url)).to eq(false)
23
- end
24
-
25
- def should_visit_once(url)
26
- expect(visited_once?(url)).to eq(true)
27
- end
28
-
29
- def should_fail_link(url)
30
- expect(visited_link?(url)).to eq(false)
31
- expect(visit_failed?(url)).to eq(true)
32
- end
33
- end
34
- end
@@ -1,8 +0,0 @@
1
- require 'net/http'
2
- require 'uri'
3
-
4
- def get_page(url)
5
- url = URI(url.to_s)
6
-
7
- return Spidr::Page.new(url,Net::HTTP.get_response(url))
8
- end
@@ -1,83 +0,0 @@
1
- require 'wsoc/config'
2
- require 'open-uri'
3
- require 'json'
4
-
5
- require 'helpers/history'
6
-
7
- module Helpers
8
- module WSOC
9
- include History
10
-
11
- SERVER_URL = URI::HTTP.build(
12
- host: (ENV['HOST'] || ::WSOC::Config::DEFAULT_HOST),
13
- port: (ENV['PORT'] || ::WSOC::Config::DEFAULT_PORT)
14
- )
15
-
16
- SPECS_URL = SERVER_URL.merge(::WSOC::Config::SPECS_PATHS[:json])
17
-
18
- COURSE_URL = SERVER_URL.merge(::WSOC::Config::COURSE_START_PATH)
19
-
20
- COURSE_METADATA = {}
21
-
22
- def self.included(base)
23
- hash = JSON.parse(open(SPECS_URL).read)
24
- metadata = hash['metadata']
25
- specs = hash['specs']
26
-
27
- if metadata.kind_of?(Hash)
28
- COURSE_METADATA.merge!(metadata)
29
- end
30
-
31
- if specs.kind_of?(Array)
32
- specs.each do |spec|
33
- message = spec['message'].dump
34
- url = spec['url'].dump
35
-
36
- case spec['behavior']
37
- when 'visit'
38
- base.module_eval %{
39
- it #{message} do
40
- should_visit_link(#{url})
41
- end
42
- }
43
- when 'ignore'
44
- base.module_eval %{
45
- it #{message} do
46
- should_ignore_link(#{url})
47
- end
48
- }
49
- when 'fail'
50
- base.module_eval %{
51
- it #{message} do
52
- should_fail_link(#{url})
53
- end
54
- }
55
- end
56
- end
57
- end
58
- end
59
-
60
- def course
61
- WSOC::COURSE_METADATA
62
- end
63
-
64
- def course_auth_store
65
- course['auth_store']
66
- end
67
-
68
- def run_course
69
- Spidr::Agent.start_at(COURSE_URL) do |agent|
70
- course_auth_store.each do |path,auth|
71
- agent.authorized.add(
72
- COURSE_URL.merge(path),
73
- auth['user'],
74
- auth['password']
75
- )
76
- end
77
-
78
- agent.every_failed_url { |url| puts "[FAILED] #{url}" }
79
- agent.every_url { |url| puts url }
80
- end
81
- end
82
- end
83
- end
@@ -1,21 +0,0 @@
1
- require 'spidr/page'
2
-
3
- require 'spec_helper'
4
-
5
- shared_examples_for "Page" do
6
- it "should have a status code" do
7
- expect(@page.code).to be_integer
8
- end
9
-
10
- it "should have a body" do
11
- expect(@page.body).not_to be_empty
12
- end
13
-
14
- it "should provide transparent access to the response headers" do
15
- expect(@page.content_type).to eq(@page.response['Content-Type'])
16
- end
17
-
18
- it "should allow content-types" do
19
- expect(@page.content_types).not_to be_empty
20
- end
21
- end