spidr_epg 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. checksums.yaml +15 -0
  2. data/.gitignore +10 -0
  3. data/.rspec +1 -0
  4. data/.yardopts +1 -0
  5. data/ChangeLog.md +291 -0
  6. data/ChangeLog.md~ +291 -0
  7. data/Gemfile +16 -0
  8. data/Gemfile.lock +49 -0
  9. data/Gemfile~ +16 -0
  10. data/LICENSE.txt +20 -0
  11. data/README.md +193 -0
  12. data/README.md~ +190 -0
  13. data/Rakefile +29 -0
  14. data/gemspec.yml +19 -0
  15. data/lib/spidr/actions/actions.rb +83 -0
  16. data/lib/spidr/actions/exceptions/action.rb +9 -0
  17. data/lib/spidr/actions/exceptions/paused.rb +11 -0
  18. data/lib/spidr/actions/exceptions/skip_link.rb +12 -0
  19. data/lib/spidr/actions/exceptions/skip_page.rb +12 -0
  20. data/lib/spidr/actions/exceptions.rb +4 -0
  21. data/lib/spidr/actions.rb +2 -0
  22. data/lib/spidr/agent.rb +866 -0
  23. data/lib/spidr/auth_credential.rb +28 -0
  24. data/lib/spidr/auth_store.rb +161 -0
  25. data/lib/spidr/body.rb +98 -0
  26. data/lib/spidr/cookie_jar.rb +202 -0
  27. data/lib/spidr/events.rb +537 -0
  28. data/lib/spidr/extensions/uri.rb +52 -0
  29. data/lib/spidr/extensions.rb +1 -0
  30. data/lib/spidr/filters.rb +539 -0
  31. data/lib/spidr/headers.rb +370 -0
  32. data/lib/spidr/links.rb +229 -0
  33. data/lib/spidr/page.rb +108 -0
  34. data/lib/spidr/rules.rb +79 -0
  35. data/lib/spidr/sanitizers.rb +56 -0
  36. data/lib/spidr/session_cache.rb +145 -0
  37. data/lib/spidr/spidr.rb +107 -0
  38. data/lib/spidr/version.rb +4 -0
  39. data/lib/spidr/version.rb~ +4 -0
  40. data/lib/spidr.rb +3 -0
  41. data/pkg/spidr-1.0.0.gem +0 -0
  42. data/spec/actions_spec.rb +59 -0
  43. data/spec/agent_spec.rb +81 -0
  44. data/spec/auth_store_spec.rb +85 -0
  45. data/spec/cookie_jar_spec.rb +144 -0
  46. data/spec/extensions/uri_spec.rb +43 -0
  47. data/spec/filters_spec.rb +61 -0
  48. data/spec/helpers/history.rb +34 -0
  49. data/spec/helpers/page.rb +8 -0
  50. data/spec/helpers/wsoc.rb +83 -0
  51. data/spec/page_examples.rb +21 -0
  52. data/spec/page_spec.rb +125 -0
  53. data/spec/rules_spec.rb +45 -0
  54. data/spec/sanitizers_spec.rb +61 -0
  55. data/spec/session_cache.rb +58 -0
  56. data/spec/spec_helper.rb +4 -0
  57. data/spec/spidr_spec.rb +39 -0
  58. data/spidr.gemspec +133 -0
  59. data/spidr.gemspec~ +131 -0
  60. metadata +158 -0
data/LICENSE.txt ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2008-2011 Hal Brodigan
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ 'Software'), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
17
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
18
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
19
+ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
20
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,193 @@
1
+ # 在这个gems基础上对它的源码进行了部分的修改,以符合自己的需求
2
+
3
+
4
+ # Spidr
5
+
6
+ * [Homepage](http://spidr.rubyforge.org/)
7
+ * [Source](http://github.com/postmodern/spidr)
8
+ * [Issues](http://github.com/postmodern/spidr/issues)
9
+ * [Mailing List](http://groups.google.com/group/spidr)
10
+ * [IRC](http://webchat.freenode.net/?channels=spidr&uio=d4)
11
+
12
+ ## Description
13
+
14
+ Spidr is a versatile Ruby web spidering library that can spider a site,
15
+ multiple domains, certain links or infinitely. Spidr is designed to be fast
16
+ and easy to use.
17
+
18
+ ## Features
19
+
20
+ * Follows:
21
+ * a tags.
22
+ * iframe tags.
23
+ * frame tags.
24
+ * Cookie protected links.
25
+ * HTTP 300, 301, 302, 303 and 307 Redirects.
26
+ * Meta-Refresh Redirects.
27
+ * HTTP Basic Auth protected links.
28
+ * Black-list or white-list URLs based upon:
29
+ * URL scheme.
30
+ * Host name
31
+ * Port number
32
+ * Full link
33
+ * URL extension
34
+ * Provides call-backs for:
35
+ * Every visited Page.
36
+ * Every visited URL.
37
+ * Every visited URL that matches a specified pattern.
38
+ * Every origin and destination URI of a link.
39
+ * Every URL that failed to be visited.
40
+ * Provides action methods to:
41
+ * Pause spidering.
42
+ * Skip processing of pages.
43
+ * Skip processing of links.
44
+ * Restore the spidering queue and history from a previous session.
45
+ * Custom User-Agent strings.
46
+ * Custom proxy settings.
47
+ * HTTPS support.
48
+
49
+ ## Examples
50
+
51
+ Start spidering from a URL:
52
+
53
+ Spidr.start_at('http://tenderlovemaking.com/')
54
+
55
+ Spider a host:
56
+
57
+ Spidr.host('coderrr.wordpress.com')
58
+
59
+ Spider a site:
60
+
61
+ Spidr.site('http://rubyflow.com/')
62
+
63
+ Spider multiple hosts:
64
+
65
+ Spidr.start_at(
66
+ 'http://company.com/',
67
+ :hosts => [
68
+ 'company.com',
69
+ /host\d\.company\.com/
70
+ ]
71
+ )
72
+
73
+ Do not spider certain links:
74
+
75
+ Spidr.site('http://matasano.com/', :ignore_links => [/log/])
76
+
77
+ Do not spider links on certain ports:
78
+
79
+ Spidr.site(
80
+ 'http://sketchy.content.com/',
81
+ :ignore_ports => [8000, 8010, 8080]
82
+ )
83
+
84
+ Print out visited URLs:
85
+
86
+ Spidr.site('http://rubyinside.org/') do |spider|
87
+ spider.every_url { |url| puts url }
88
+ end
89
+
90
+ Build a URL map of a site:
91
+
92
+ url_map = Hash.new { |hash,key| hash[key] = [] }
93
+
94
+ Spidr.site('http://intranet.com/') do |spider|
95
+ spider.every_link do |origin,dest|
96
+ url_map[dest] << origin
97
+ end
98
+ end
99
+
100
+ Print out the URLs that could not be requested:
101
+
102
+ Spidr.site('http://sketchy.content.com/') do |spider|
103
+ spider.every_failed_url { |url| puts url }
104
+ end
105
+
106
+ Find all pages which have broken links:
107
+
108
+ url_map = Hash.new { |hash,key| hash[key] = [] }
109
+
110
+ spider = Spidr.site('http://intranet.com/') do |spider|
111
+ spider.every_link do |origin,dest|
112
+ url_map[dest] << origin
113
+ end
114
+ end
115
+
116
+ spider.failures.each do |url|
117
+ puts "Broken link #{url} found in:"
118
+
119
+ url_map[url].each { |page| puts " #{page}" }
120
+ end
121
+
122
+ Search HTML and XML pages:
123
+
124
+ Spidr.site('http://company.withablog.com/') do |spider|
125
+ spider.every_page do |page|
126
+ puts "[-] #{page.url}"
127
+
128
+ page.search('//meta').each do |meta|
129
+ name = (meta.attributes['name'] || meta.attributes['http-equiv'])
130
+ value = meta.attributes['content']
131
+
132
+ puts " #{name} = #{value}"
133
+ end
134
+ end
135
+ end
136
+
137
+ Print out the titles from every page:
138
+
139
+ Spidr.site('http://www.rubypulse.com/') do |spider|
140
+ spider.every_html_page do |page|
141
+ puts page.title
142
+ end
143
+ end
144
+
145
+ Find what kinds of web servers a host is using, by accessing the headers:
146
+
147
+ servers = Set[]
148
+
149
+ Spidr.host('generic.company.com') do |spider|
150
+ spider.all_headers do |headers|
151
+ servers << headers['server']
152
+ end
153
+ end
154
+
155
+ Pause the spider on a forbidden page:
156
+
157
+ spider = Spidr.host('overnight.startup.com') do |spider|
158
+ spider.every_forbidden_page do |page|
159
+ spider.pause!
160
+ end
161
+ end
162
+
163
+ Skip the processing of a page:
164
+
165
+ Spidr.host('sketchy.content.com') do |spider|
166
+ spider.every_missing_page do |page|
167
+ spider.skip_page!
168
+ end
169
+ end
170
+
171
+ Skip the processing of links:
172
+
173
+ Spidr.host('sketchy.content.com') do |spider|
174
+ spider.every_url do |url|
175
+ if url.path.split('/').find { |dir| dir.to_i > 1000 }
176
+ spider.skip_link!
177
+ end
178
+ end
179
+ end
180
+
181
+ ## Requirements
182
+
183
+ * [nokogiri](http://nokogiri.rubyforge.org/) ~> 1.3
184
+
185
+ ## Install
186
+
187
+ $ sudo gem install spidr
188
+
189
+ ## License
190
+
191
+ Copyright (c) 2008-2011 Hal Brodigan
192
+
193
+ See {file:LICENSE.txt} for license information.
data/README.md~ ADDED
@@ -0,0 +1,190 @@
1
+ # Spidr
2
+
3
+ * [Homepage](http://spidr.rubyforge.org/)
4
+ * [Source](http://github.com/postmodern/spidr)
5
+ * [Issues](http://github.com/postmodern/spidr/issues)
6
+ * [Mailing List](http://groups.google.com/group/spidr)
7
+ * [IRC](http://webchat.freenode.net/?channels=spidr&uio=d4)
8
+
9
+ ## Description
10
+
11
+ Spidr is a versatile Ruby web spidering library that can spider a site,
12
+ multiple domains, certain links or infinitely. Spidr is designed to be fast
13
+ and easy to use.
14
+
15
+ ## Features
16
+
17
+ * Follows:
18
+ * a tags.
19
+ * iframe tags.
20
+ * frame tags.
21
+ * Cookie protected links.
22
+ * HTTP 300, 301, 302, 303 and 307 Redirects.
23
+ * Meta-Refresh Redirects.
24
+ * HTTP Basic Auth protected links.
25
+ * Black-list or white-list URLs based upon:
26
+ * URL scheme.
27
+ * Host name
28
+ * Port number
29
+ * Full link
30
+ * URL extension
31
+ * Provides call-backs for:
32
+ * Every visited Page.
33
+ * Every visited URL.
34
+ * Every visited URL that matches a specified pattern.
35
+ * Every origin and destination URI of a link.
36
+ * Every URL that failed to be visited.
37
+ * Provides action methods to:
38
+ * Pause spidering.
39
+ * Skip processing of pages.
40
+ * Skip processing of links.
41
+ * Restore the spidering queue and history from a previous session.
42
+ * Custom User-Agent strings.
43
+ * Custom proxy settings.
44
+ * HTTPS support.
45
+
46
+ ## Examples
47
+
48
+ Start spidering from a URL:
49
+
50
+ Spidr.start_at('http://tenderlovemaking.com/')
51
+
52
+ Spider a host:
53
+
54
+ Spidr.host('coderrr.wordpress.com')
55
+
56
+ Spider a site:
57
+
58
+ Spidr.site('http://rubyflow.com/')
59
+
60
+ Spider multiple hosts:
61
+
62
+ Spidr.start_at(
63
+ 'http://company.com/',
64
+ :hosts => [
65
+ 'company.com',
66
+ /host\d\.company\.com/
67
+ ]
68
+ )
69
+
70
+ Do not spider certain links:
71
+
72
+ Spidr.site('http://matasano.com/', :ignore_links => [/log/])
73
+
74
+ Do not spider links on certain ports:
75
+
76
+ Spidr.site(
77
+ 'http://sketchy.content.com/',
78
+ :ignore_ports => [8000, 8010, 8080]
79
+ )
80
+
81
+ Print out visited URLs:
82
+
83
+ Spidr.site('http://rubyinside.org/') do |spider|
84
+ spider.every_url { |url| puts url }
85
+ end
86
+
87
+ Build a URL map of a site:
88
+
89
+ url_map = Hash.new { |hash,key| hash[key] = [] }
90
+
91
+ Spidr.site('http://intranet.com/') do |spider|
92
+ spider.every_link do |origin,dest|
93
+ url_map[dest] << origin
94
+ end
95
+ end
96
+
97
+ Print out the URLs that could not be requested:
98
+
99
+ Spidr.site('http://sketchy.content.com/') do |spider|
100
+ spider.every_failed_url { |url| puts url }
101
+ end
102
+
103
+ Find all pages which have broken links:
104
+
105
+ url_map = Hash.new { |hash,key| hash[key] = [] }
106
+
107
+ spider = Spidr.site('http://intranet.com/') do |spider|
108
+ spider.every_link do |origin,dest|
109
+ url_map[dest] << origin
110
+ end
111
+ end
112
+
113
+ spider.failures.each do |url|
114
+ puts "Broken link #{url} found in:"
115
+
116
+ url_map[url].each { |page| puts " #{page}" }
117
+ end
118
+
119
+ Search HTML and XML pages:
120
+
121
+ Spidr.site('http://company.withablog.com/') do |spider|
122
+ spider.every_page do |page|
123
+ puts "[-] #{page.url}"
124
+
125
+ page.search('//meta').each do |meta|
126
+ name = (meta.attributes['name'] || meta.attributes['http-equiv'])
127
+ value = meta.attributes['content']
128
+
129
+ puts " #{name} = #{value}"
130
+ end
131
+ end
132
+ end
133
+
134
+ Print out the titles from every page:
135
+
136
+ Spidr.site('http://www.rubypulse.com/') do |spider|
137
+ spider.every_html_page do |page|
138
+ puts page.title
139
+ end
140
+ end
141
+
142
+ Find what kinds of web servers a host is using, by accessing the headers:
143
+
144
+ servers = Set[]
145
+
146
+ Spidr.host('generic.company.com') do |spider|
147
+ spider.all_headers do |headers|
148
+ servers << headers['server']
149
+ end
150
+ end
151
+
152
+ Pause the spider on a forbidden page:
153
+
154
+ spider = Spidr.host('overnight.startup.com') do |spider|
155
+ spider.every_forbidden_page do |page|
156
+ spider.pause!
157
+ end
158
+ end
159
+
160
+ Skip the processing of a page:
161
+
162
+ Spidr.host('sketchy.content.com') do |spider|
163
+ spider.every_missing_page do |page|
164
+ spider.skip_page!
165
+ end
166
+ end
167
+
168
+ Skip the processing of links:
169
+
170
+ Spidr.host('sketchy.content.com') do |spider|
171
+ spider.every_url do |url|
172
+ if url.path.split('/').find { |dir| dir.to_i > 1000 }
173
+ spider.skip_link!
174
+ end
175
+ end
176
+ end
177
+
178
+ ## Requirements
179
+
180
+ * [nokogiri](http://nokogiri.rubyforge.org/) ~> 1.3
181
+
182
+ ## Install
183
+
184
+ $ sudo gem install spidr
185
+
186
+ ## License
187
+
188
+ Copyright (c) 2008-2011 Hal Brodigan
189
+
190
+ See {file:LICENSE.txt} for license information.
data/Rakefile ADDED
@@ -0,0 +1,29 @@
1
+ require 'rubygems'
2
+
3
+ begin
4
+ require 'bundler'
5
+ rescue LoadError => e
6
+ STDERR.puts e.message
7
+ STDERR.puts "Run `gem install bundler` to install Bundler."
8
+ exit e.status_code
9
+ end
10
+
11
+ begin
12
+ Bundler.setup(:development)
13
+ rescue Bundler::BundlerError => e
14
+ STDERR.puts e.message
15
+ STDERR.puts "Run `bundle install` to install missing gems"
16
+ exit e.status_code
17
+ end
18
+
19
+ require 'rake'
20
+
21
+ require 'rubygems/tasks'
22
+ Gem::Tasks.new
23
+
24
+ require 'rspec/core/rake_task'
25
+ RSpec::Core::RakeTask.new
26
+ task :default => :spec
27
+
28
+ require 'yard'
29
+ YARD::Rake::YardocTask.new
data/gemspec.yml ADDED
@@ -0,0 +1,19 @@
1
+ name: spidr
2
+ summary: A versatile Ruby web spidering library
3
+ description:
4
+ Spidr is a versatile Ruby web spidering library that can spider a site,
5
+ multiple domains, certain links or infinitely. Spidr is designed to be
6
+ fast and easy to use.
7
+
8
+ license: MIT
9
+ authors: Postmodern
10
+ email: postmodern.mod3@gmail.com
11
+ homepage: http://github.com/postmodern/spidr
12
+ has_yard: true
13
+
14
+ dependencies:
15
+ nokogiri: ~> 1.3
16
+
17
+ development_dependencies:
18
+ bundler: ~> 1.0
19
+ yard: ~> 0.7
@@ -0,0 +1,83 @@
1
+ require 'spidr/actions/exceptions/paused'
2
+ require 'spidr/actions/exceptions/skip_link'
3
+ require 'spidr/actions/exceptions/skip_page'
4
+
5
+ module Spidr
6
+ #
7
+ # The {Actions} module adds methods to {Agent} for controlling the
8
+ # spidering of links.
9
+ #
10
+ module Actions
11
+ #
12
+ # Continue spidering.
13
+ #
14
+ # @yield [page]
15
+ # If a block is given, it will be passed every page visited.
16
+ #
17
+ # @yieldparam [Page] page
18
+ # The page to be visited.
19
+ #
20
+ def continue!(&block)
21
+ @paused = false
22
+ return run(&block)
23
+ end
24
+
25
+ #
26
+ # Sets the pause state of the agent.
27
+ #
28
+ # @param [Boolean] state
29
+ # The new pause state of the agent.
30
+ #
31
+ def pause=(state)
32
+ @paused = state
33
+ end
34
+
35
+ #
36
+ # Pauses the agent, causing spidering to temporarily stop.
37
+ #
38
+ # @raise [Paused]
39
+ # Indicates to the agent, that it should pause spidering.
40
+ #
41
+ def pause!
42
+ @paused = true
43
+ raise(Paused)
44
+ end
45
+
46
+ #
47
+ # Determines whether the agent is paused.
48
+ #
49
+ # @return [Boolean]
50
+ # Specifies whether the agent is paused.
51
+ #
52
+ def paused?
53
+ @paused == true
54
+ end
55
+
56
+ #
57
+ # Causes the agent to skip the link being enqueued.
58
+ #
59
+ # @raise [SkipLink]
60
+ # Indicates to the agent, that the current link should be skipped,
61
+ # and not enqueued or visited.
62
+ #
63
+ def skip_link!
64
+ raise(SkipLink)
65
+ end
66
+
67
+ #
68
+ # Causes the agent to skip the page being visited.
69
+ #
70
+ # @raise [SkipPage]
71
+ # Indicates to the agent, that the current page should be skipped.
72
+ #
73
+ def skip_page!
74
+ raise(SkipPage)
75
+ end
76
+
77
+ protected
78
+
79
+ def initialize_actions(options={})
80
+ @paused = false
81
+ end
82
+ end
83
+ end
@@ -0,0 +1,9 @@
1
+ module Spidr
2
+ module Actions
3
+ #
4
+ # The base {Actions} exception class.
5
+ #
6
+ class Action < RuntimeError
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,11 @@
1
+ require 'spidr/actions/exceptions/action'
2
+
3
+ module Spidr
4
+ module Actions
5
+ #
6
+ # An {Actions} exception class used to pause a running {Agent}.
7
+ #
8
+ class Paused < Action
9
+ end
10
+ end
11
+ end
@@ -0,0 +1,12 @@
1
+ require 'spidr/actions/exceptions/action'
2
+
3
+ module Spidr
4
+ module Actions
5
+ #
6
+ # An {Actions} exception class which causes a running {Agent} to
7
+ # skip a link.
8
+ #
9
+ class SkipLink < Action
10
+ end
11
+ end
12
+ end
@@ -0,0 +1,12 @@
1
+ require 'spidr/actions/exceptions/action'
2
+
3
+ module Spidr
4
+ module Actions
5
+ #
6
+ # An {Actions} exception class which causes a running {Agent} to
7
+ # skip a {Page}, and all links within that page.
8
+ #
9
+ class SkipPage < Action
10
+ end
11
+ end
12
+ end
@@ -0,0 +1,4 @@
1
+ require 'spidr/actions/exceptions/action'
2
+ require 'spidr/actions/exceptions/paused'
3
+ require 'spidr/actions/exceptions/skip_link'
4
+ require 'spidr/actions/exceptions/skip_page'
@@ -0,0 +1,2 @@
1
+ require 'spidr/actions/exceptions'
2
+ require 'spidr/actions/actions'