spidr 0.5.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. checksums.yaml +4 -4
  2. data/.travis.yml +14 -0
  3. data/ChangeLog.md +20 -2
  4. data/Gemfile +2 -2
  5. data/README.md +4 -2
  6. data/Rakefile +1 -0
  7. data/gemspec.yml +1 -1
  8. data/lib/spidr/agent.rb +145 -85
  9. data/lib/spidr/agent/filters.rb +1 -9
  10. data/lib/spidr/agent/robots.rb +36 -0
  11. data/lib/spidr/page.rb +76 -28
  12. data/lib/spidr/page/{headers.rb → content_types.rb} +2 -147
  13. data/lib/spidr/page/cookies.rb +60 -0
  14. data/lib/spidr/page/{links.rb → html.rb} +47 -23
  15. data/lib/spidr/page/status_codes.rb +112 -0
  16. data/lib/spidr/proxy.rb +56 -0
  17. data/lib/spidr/session_cache.rb +60 -24
  18. data/lib/spidr/settings.rb +3 -0
  19. data/lib/spidr/settings/proxy.rb +61 -0
  20. data/lib/spidr/settings/timeouts.rb +33 -0
  21. data/lib/spidr/settings/user_agent.rb +14 -0
  22. data/lib/spidr/spidr.rb +15 -79
  23. data/lib/spidr/version.rb +1 -1
  24. data/spec/agent/actions_spec.rb +158 -32
  25. data/spec/agent/filters_spec.rb +46 -29
  26. data/spec/agent/sanitizers_spec.rb +25 -31
  27. data/spec/agent_spec.rb +772 -50
  28. data/spec/example_app.rb +27 -0
  29. data/spec/example_page.rb +33 -0
  30. data/spec/page/content_types_spec.rb +150 -0
  31. data/spec/page/cookies_spec.rb +58 -0
  32. data/spec/page/html_spec.rb +524 -0
  33. data/spec/page/status_codes_spec.rb +87 -0
  34. data/spec/page_spec.rb +114 -78
  35. data/spec/proxy_spec.rb +45 -0
  36. data/spec/session_cache.rb +103 -2
  37. data/spec/settings/proxy_examples.rb +82 -0
  38. data/spec/settings/timeouts_examples.rb +93 -0
  39. data/spec/settings/user_agent_examples.rb +25 -0
  40. data/spec/spidr_spec.rb +6 -29
  41. data/spidr.gemspec +38 -109
  42. metadata +35 -31
  43. data/lib/spidr/page/body.rb +0 -98
  44. data/spec/helpers/history.rb +0 -34
  45. data/spec/helpers/page.rb +0 -8
  46. data/spec/helpers/wsoc.rb +0 -83
  47. data/spec/page_examples.rb +0 -21
@@ -1,98 +0,0 @@
1
- require 'nokogiri'
2
-
3
- module Spidr
4
- class Page
5
- #
6
- # The body of the response.
7
- #
8
- # @return [String]
9
- # The body of the response.
10
- #
11
- def body
12
- (response.body || '')
13
- end
14
-
15
- #
16
- # Returns a parsed document object for HTML, XML, RSS and Atom pages.
17
- #
18
- # @return [Nokogiri::HTML::Document, Nokogiri::XML::Document, nil]
19
- # The document that represents HTML or XML pages.
20
- # Returns `nil` if the page is neither HTML, XML, RSS, Atom or if
21
- # the page could not be parsed properly.
22
- #
23
- # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
24
- # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/HTML/Document.html
25
- #
26
- def doc
27
- unless body.empty?
28
- begin
29
- if html?
30
- @doc ||= Nokogiri::HTML(body, @url.to_s, content_charset)
31
- elsif (rss? || atom? || xml? || xsl?)
32
- @doc ||= Nokogiri::XML(body, @url.to_s, content_charset)
33
- end
34
- rescue
35
- end
36
- end
37
- end
38
-
39
- #
40
- # Searches the document for XPath or CSS Path paths.
41
- #
42
- # @param [Array<String>] paths
43
- # CSS or XPath expressions to search the document with.
44
- #
45
- # @return [Array]
46
- # The matched nodes from the document.
47
- # Returns an empty Array if no nodes were matched, or if the page
48
- # is not an HTML or XML document.
49
- #
50
- # @example
51
- # page.search('//a[@href]')
52
- #
53
- # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Node.html#M000239
54
- #
55
- def search(*paths)
56
- if doc
57
- doc.search(*paths)
58
- else
59
- []
60
- end
61
- end
62
-
63
- #
64
- # Searches for the first occurrence an XPath or CSS Path expression.
65
- #
66
- # @return [Nokogiri::HTML::Node, Nokogiri::XML::Node, nil]
67
- # The first matched node. Returns `nil` if no nodes could be matched,
68
- # or if the page is not a HTML or XML document.
69
- #
70
- # @example
71
- # page.at('//title')
72
- #
73
- # @see http://nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Node.html#M000251
74
- #
75
- def at(*arguments)
76
- if doc
77
- doc.at(*arguments)
78
- end
79
- end
80
-
81
- alias / search
82
- alias % at
83
-
84
- #
85
- # The title of the HTML page.
86
- #
87
- # @return [String]
88
- # The inner-text of the title element of the page.
89
- #
90
- def title
91
- if (node = at('//title'))
92
- node.inner_text
93
- end
94
- end
95
-
96
- alias to_s body
97
- end
98
- end
@@ -1,34 +0,0 @@
1
- module Helpers
2
- module History
3
- def visited_once?(url)
4
- return @agent.visited_urls.select { |visited_url|
5
- visited_url == url
6
- }.length == 1
7
- end
8
-
9
- def visited_link?(url)
10
- @agent.visited?(url)
11
- end
12
-
13
- def visit_failed?(url)
14
- @agent.failed?(url)
15
- end
16
-
17
- def should_visit_link(url)
18
- expect(visited_link?(url)).to eq(true)
19
- end
20
-
21
- def should_ignore_link(url)
22
- expect(visited_link?(url)).to eq(false)
23
- end
24
-
25
- def should_visit_once(url)
26
- expect(visited_once?(url)).to eq(true)
27
- end
28
-
29
- def should_fail_link(url)
30
- expect(visited_link?(url)).to eq(false)
31
- expect(visit_failed?(url)).to eq(true)
32
- end
33
- end
34
- end
@@ -1,8 +0,0 @@
1
- require 'net/http'
2
- require 'uri'
3
-
4
- def get_page(url)
5
- url = URI(url.to_s)
6
-
7
- return Spidr::Page.new(url,Net::HTTP.get_response(url))
8
- end
@@ -1,83 +0,0 @@
1
- require 'wsoc/config'
2
- require 'open-uri'
3
- require 'json'
4
-
5
- require 'helpers/history'
6
-
7
- module Helpers
8
- module WSOC
9
- include History
10
-
11
- SERVER_URL = URI::HTTP.build(
12
- host: (ENV['HOST'] || ::WSOC::Config::DEFAULT_HOST),
13
- port: (ENV['PORT'] || ::WSOC::Config::DEFAULT_PORT)
14
- )
15
-
16
- SPECS_URL = SERVER_URL.merge(::WSOC::Config::SPECS_PATHS[:json])
17
-
18
- COURSE_URL = SERVER_URL.merge(::WSOC::Config::COURSE_START_PATH)
19
-
20
- COURSE_METADATA = {}
21
-
22
- def self.included(base)
23
- hash = JSON.parse(open(SPECS_URL).read)
24
- metadata = hash['metadata']
25
- specs = hash['specs']
26
-
27
- if metadata.kind_of?(Hash)
28
- COURSE_METADATA.merge!(metadata)
29
- end
30
-
31
- if specs.kind_of?(Array)
32
- specs.each do |spec|
33
- message = spec['message'].dump
34
- url = spec['url'].dump
35
-
36
- case spec['behavior']
37
- when 'visit'
38
- base.module_eval %{
39
- it #{message} do
40
- should_visit_link(#{url})
41
- end
42
- }
43
- when 'ignore'
44
- base.module_eval %{
45
- it #{message} do
46
- should_ignore_link(#{url})
47
- end
48
- }
49
- when 'fail'
50
- base.module_eval %{
51
- it #{message} do
52
- should_fail_link(#{url})
53
- end
54
- }
55
- end
56
- end
57
- end
58
- end
59
-
60
- def course
61
- WSOC::COURSE_METADATA
62
- end
63
-
64
- def course_auth_store
65
- course['auth_store']
66
- end
67
-
68
- def run_course
69
- Spidr::Agent.start_at(COURSE_URL) do |agent|
70
- course_auth_store.each do |path,auth|
71
- agent.authorized.add(
72
- COURSE_URL.merge(path),
73
- auth['user'],
74
- auth['password']
75
- )
76
- end
77
-
78
- agent.every_failed_url { |url| puts "[FAILED] #{url}" }
79
- agent.every_url { |url| puts url }
80
- end
81
- end
82
- end
83
- end
@@ -1,21 +0,0 @@
1
- require 'spidr/page'
2
-
3
- require 'spec_helper'
4
-
5
- shared_examples_for "Page" do
6
- it "should have a status code" do
7
- expect(@page.code).to be_integer
8
- end
9
-
10
- it "should have a body" do
11
- expect(@page.body).not_to be_empty
12
- end
13
-
14
- it "should provide transparent access to the response headers" do
15
- expect(@page.content_type).to eq(@page.response['Content-Type'])
16
- end
17
-
18
- it "should allow content-types" do
19
- expect(@page.content_types).not_to be_empty
20
- end
21
- end