spidr_epg_gem 0.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +15 -0
- data/lib/spidr.rb +3 -0
- data/lib/spidr_epg/actions/actions.rb +83 -0
- data/lib/spidr_epg/actions/exceptions/action.rb +9 -0
- data/lib/spidr_epg/actions/exceptions/paused.rb +11 -0
- data/lib/spidr_epg/actions/exceptions/skip_link.rb +12 -0
- data/lib/spidr_epg/actions/exceptions/skip_page.rb +12 -0
- data/lib/spidr_epg/actions/exceptions.rb +4 -0
- data/lib/spidr_epg/actions.rb +2 -0
- data/lib/spidr_epg/agent.rb +747 -0
- data/lib/spidr_epg/auth_credential.rb +28 -0
- data/lib/spidr_epg/auth_store.rb +161 -0
- data/lib/spidr_epg/body.rb +98 -0
- data/lib/spidr_epg/cookie_jar.rb +202 -0
- data/lib/spidr_epg/events.rb +537 -0
- data/lib/spidr_epg/extensions/uri.rb +52 -0
- data/lib/spidr_epg/extensions.rb +1 -0
- data/lib/spidr_epg/filters.rb +539 -0
- data/lib/spidr_epg/headers.rb +370 -0
- data/lib/spidr_epg/links.rb +229 -0
- data/lib/spidr_epg/page.rb +108 -0
- data/lib/spidr_epg/rules.rb +79 -0
- data/lib/spidr_epg/sanitizers.rb +56 -0
- data/lib/spidr_epg/session_cache.rb +145 -0
- data/lib/spidr_epg/spidr.rb +98 -0
- data/lib/spidr_epg/version.rb +4 -0
- data/lib/spidr_epg.rb +3 -0
- data/lib/spidr_epg_gem.rb~ +3 -0
- data/lib/spidr_epg_gem~ +7 -0
- data/spidr_epg_gem.gemspec +17 -0
- metadata +72 -0
@@ -0,0 +1,79 @@
|
|
1
|
+
module Spidr
  #
  # The {Rules} class represents collections of acceptance and rejection
  # rules, which are used to filter data.
  #
  # A rule may be a Proc (called with the data), a Regexp (matched against
  # the data's String form) or any other object (compared with `==`).
  #
  class Rules

    # Accept rules
    attr_reader :accept

    # Reject rules
    attr_reader :reject

    #
    # Creates a new Rules object.
    #
    # @param [Hash] options
    #   Additional options.
    #
    # @option options [Array<String, Regexp, Proc>, String, Regexp, Proc] :accept
    #   The pattern(s) to accept data with. A single pattern is wrapped
    #   in an Array.
    #
    # @option options [Array<String, Regexp, Proc>, String, Regexp, Proc] :reject
    #   The pattern(s) to reject data with. A single pattern is wrapped
    #   in an Array.
    #
    def initialize(options={})
      # Array() lets callers pass one bare pattern; dup keeps the rules
      # independent of the Array object the caller handed in.
      @accept = Array(options[:accept] || []).dup
      @reject = Array(options[:reject] || []).dup
    end

    #
    # Determines whether the data should be accepted or rejected.
    #
    # When acceptance rules exist, only they are consulted and the data
    # must match at least one of them. Otherwise the data is accepted
    # unless it matches one of the rejection rules.
    #
    # @param [Object] data
    #   The data to test.
    #
    # @return [Boolean]
    #   Specifies whether the given data was accepted.
    #
    def accept?(data)
      if @accept.empty?
        @reject.none? { |rule| test_data(data, rule) }
      else
        @accept.any? { |rule| test_data(data, rule) }
      end
    end

    #
    # Determines whether the data should be rejected or accepted.
    #
    # @param [Object] data
    #   The data to test.
    #
    # @return [Boolean]
    #   The negation of {#accept?} for the same data.
    #
    def reject?(data)
      !accept?(data)
    end

    protected

    #
    # Tests the given data against a given pattern.
    #
    # @param [Object] data
    #   The data to test.
    #
    # @param [Proc, Regexp, Object] rule
    #   The pattern to test the data with.
    #
    # @return [Boolean]
    #   Specifies whether the given data matched the pattern.
    #
    def test_data(data, rule)
      case rule
      when Proc
        # Only a literal `true` result counts as a match; other truthy
        # values returned by the Proc do not.
        rule.call(data) == true
      when Regexp
        !(data.to_s =~ rule).nil?
      else
        data == rule
      end
    end

  end
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
require 'uri'
|
2
|
+
|
3
|
+
module Spidr
  #
  # The {Sanitizers} module adds methods to {Agent} which control the
  # sanitation of incoming links.
  #
  module Sanitizers
    # Specifies whether the Agent will strip URI fragments
    attr_accessor :strip_fragments

    # Specifies whether the Agent will strip URI queries
    attr_accessor :strip_query

    #
    # Sanitizes a URL based on filtering options.
    #
    # The given URL is never modified; a URI passed in is copied before
    # any component is stripped.
    #
    # @param [URI::HTTP, URI::HTTPS, String] url
    #   The URL to be sanitized
    #
    # @return [URI::HTTP, URI::HTTPS]
    #   The new sanitized URL.
    #
    # @since 0.2.2
    #
    def sanitize_url(url)
      # Work on a copy so the caller's URI keeps its fragment and query.
      url = url.kind_of?(URI) ? url.dup : URI(url.to_s)

      url.fragment = nil if @strip_fragments
      url.query = nil if @strip_query

      url
    end

    protected

    #
    # Initializes the Sanitizer rules.
    #
    # @param [Hash] options
    #   Additional options.
    #
    # @option options [Boolean] :strip_fragments (true)
    #   Specifies whether or not to strip the fragment component from URLs.
    #
    # @option options [Boolean] :strip_query (false)
    #   Specifies whether or not to strip the query component from URLs.
    #
    # @since 0.2.2
    #
    def initialize_sanitizers(options={})
      @strip_fragments = options.fetch(:strip_fragments, true)
      @strip_query = options.fetch(:strip_query, false)
    end
  end
end
|
@@ -0,0 +1,145 @@
|
|
1
|
+
require 'spidr/spidr'
|
2
|
+
|
3
|
+
require 'net/http'
|
4
|
+
|
5
|
+
module Spidr
  #
  # Stores active HTTP Sessions organized by scheme, host-name and port.
  #
  class SessionCache

    # Proxy to use
    attr_accessor :proxy

    #
    # Creates a new session cache.
    #
    # @param [Hash, nil] proxy (Spidr.proxy)
    #   Proxy options. `nil` is treated as "no proxy".
    #
    # @option proxy [String] :host
    #   The host the proxy is running on.
    #
    # @option proxy [Integer] :port
    #   The port the proxy is running on.
    #
    # @option proxy [String] :user
    #   The user to authenticate as with the proxy.
    #
    # @option proxy [String] :password
    #   The password to authenticate with.
    #
    # @since 0.2.2
    #
    def initialize(proxy=Spidr.proxy)
      @proxy = proxy
      @sessions = {}
    end

    #
    # Determines if there is an active HTTP session for a given URL.
    #
    # @param [URI::HTTP, String] url
    #   The URL that represents a session.
    #
    # @return [Boolean]
    #   Specifies whether there is an active HTTP session.
    #
    # @since 0.2.3
    #
    def active?(url)
      @sessions.key?(key_for(to_uri(url)))
    end

    #
    # Provides an active HTTP session for a given URL.
    #
    # A session is created and cached on first use. HTTPS sessions are
    # started immediately; sessions for other schemes are returned
    # without being started.
    #
    # @param [URI::HTTP, String] url
    #   The URL which will be requested later.
    #
    # @return [Net::HTTP]
    #   The active HTTP session object.
    #
    def [](url)
      url = to_uri(url)
      key = key_for(url)

      @sessions[key] ||= begin
        # A nil proxy means "no proxy" rather than a NoMethodError.
        proxy = @proxy || {}

        session = Net::HTTP::Proxy(
          proxy[:host],
          proxy[:port],
          proxy[:user],
          proxy[:password]
        ).new(url.host, url.port)

        if url.scheme == 'https'
          session.use_ssl = true
          # NOTE(review): certificate verification is disabled, so HTTPS
          # peers are not authenticated. Kept as-is to preserve behavior.
          session.verify_mode = OpenSSL::SSL::VERIFY_NONE
          session.start
        end

        session
      end
    end

    #
    # Destroys an HTTP session for the given scheme, host and port.
    #
    # @param [URI::HTTP, String] url
    #   The URL of the requested session.
    #
    # @return [Net::HTTP, nil]
    #   The session that was removed from the cache, or `nil` if no
    #   session was cached for the URL.
    #
    # @since 0.2.2
    #
    def kill!(url)
      key = key_for(to_uri(url))

      if (sess = @sessions[key])
        begin
          sess.finish
        rescue IOError
          # finishing the session is best-effort; the entry is removed
          # from the cache regardless.
        end

        @sessions.delete(key)
      end
    end

    #
    # Clears the session cache.
    #
    # @return [SessionCache]
    #   The cleared session cache.
    #
    # @since 0.2.2
    #
    def clear
      @sessions.each_value do |sess|
        begin
          sess.finish
        rescue IOError
          nil
        end
      end

      @sessions.clear
      return self
    end

    private

    #
    # Normalizes a URL into a URI object.
    #
    # @param [URI::HTTP, String] url
    #   The URL to normalize.
    #
    # @return [URI]
    #   The given URI, or a URI parsed from the String form of `url`.
    #
    def to_uri(url)
      url.kind_of?(URI) ? url : URI(url.to_s)
    end

    #
    # Builds the cache key for a URI.
    #
    # @param [URI] uri
    #   The normalized URL.
    #
    # @return [Array]
    #   The scheme, host and port of the URI.
    #
    def key_for(uri)
      [uri.scheme, uri.host, uri.port]
    end

  end
end
|
@@ -0,0 +1,98 @@
|
|
1
|
+
require 'spidr/agent'
|
2
|
+
|
3
|
+
module Spidr
  # Common proxy port.
  COMMON_PROXY_PORT = 8080

  # Default proxy information. Frozen so that configuring the proxy can
  # never alter the defaults; {Spidr.proxy} hands out copies instead.
  DEFAULT_PROXY = {
    :host => nil,
    :port => COMMON_PROXY_PORT,
    :user => nil,
    :password => nil
  }.freeze

  #
  # Proxy information used by all newly created Agent objects by default.
  #
  # The returned Hash is the stored configuration itself, so in-place
  # changes (e.g. `Spidr.proxy[:host] = 'proxy'`) are retained until the
  # proxy is reassigned or disabled.
  #
  # @return [Hash]
  #   The Spidr proxy information.
  #
  def self.proxy
    @@spidr_proxy ||= DEFAULT_PROXY.dup
  end

  #
  # Sets the proxy information used by Agent objects.
  #
  # @param [Hash] new_proxy
  #   The new proxy information.
  #
  # @option new_proxy [String] :host
  #   The host-name of the proxy.
  #
  # @option new_proxy [Integer] :port (COMMON_PROXY_PORT)
  #   The port of the proxy.
  #
  # @option new_proxy [String] :user
  #   The user to authenticate with the proxy as.
  #
  # @option new_proxy [String] :password
  #   The password to authenticate with the proxy.
  #
  # @return [Hash]
  #   The new proxy information.
  #
  def self.proxy=(new_proxy)
    @@spidr_proxy = {:port => COMMON_PROXY_PORT}.merge(new_proxy)
  end

  #
  # Disables the proxy settings used by all newly created Agent objects.
  #
  # @return [true]
  #
  def self.disable_proxy!
    # Reset to a fresh copy of the defaults, not to the constant itself.
    @@spidr_proxy = DEFAULT_PROXY.dup
    return true
  end

  #
  # The User-Agent string used by all Agent objects by default.
  #
  # @return [String, nil]
  #   The Spidr User-Agent string, or `nil` if none has been set.
  #
  def self.user_agent
    @@spidr_user_agent ||= nil
  end

  #
  # Sets the Spidr User-Agent string.
  #
  # @param [String] new_agent
  #   The new User-Agent string.
  #
  def self.user_agent=(new_agent)
    @@spidr_user_agent = new_agent
  end

  #
  # @see Agent.start_at
  #
  def self.start_at(url, options={}, &block)
    Agent.start_at(url, options, &block)
  end

  #
  # @see Agent.host
  #
  def self.host(name, options={}, &block)
    Agent.host(name, options, &block)
  end

  #
  # @see Agent.site
  #
  def self.site(url, options={}, &block)
    Agent.site(url, options, &block)
  end
end
|
data/lib/spidr_epg.rb
ADDED
data/lib/spidr_epg_gem~
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
# Gem specification for spidr_epg_gem.
Gem::Specification.new do |s|
  s.platform = Gem::Platform::RUBY
  s.name = 'spidr_epg_gem'
  s.version = '0.0.0'
  s.date = %q{2013-04-15}
  s.summary = 'Use for crawling EPG'
  s.description = 'Use for crawling EPG'
  s.required_ruby_version = '>= 1.9.3'

  s.authors = ["zql"]
  s.email = ''

  # Package every git-tracked file except editor backups (names ending
  # in "~"), which are not part of the library.
  s.files = `git ls-files`.split("\n").reject { |path| path.end_with?('~') }
  s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.require_path = 'lib'
  s.requirements << 'none'
end
|
metadata
ADDED
@@ -0,0 +1,72 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: spidr_epg_gem
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- zql
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2013-04-15 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: Use for crawling EPG
|
14
|
+
email: ''
|
15
|
+
executables: []
|
16
|
+
extensions: []
|
17
|
+
extra_rdoc_files: []
|
18
|
+
files:
|
19
|
+
- lib/spidr.rb
|
20
|
+
- lib/spidr_epg.rb
|
21
|
+
- lib/spidr_epg/actions.rb
|
22
|
+
- lib/spidr_epg/actions/actions.rb
|
23
|
+
- lib/spidr_epg/actions/exceptions.rb
|
24
|
+
- lib/spidr_epg/actions/exceptions/action.rb
|
25
|
+
- lib/spidr_epg/actions/exceptions/paused.rb
|
26
|
+
- lib/spidr_epg/actions/exceptions/skip_link.rb
|
27
|
+
- lib/spidr_epg/actions/exceptions/skip_page.rb
|
28
|
+
- lib/spidr_epg/agent.rb
|
29
|
+
- lib/spidr_epg/auth_credential.rb
|
30
|
+
- lib/spidr_epg/auth_store.rb
|
31
|
+
- lib/spidr_epg/body.rb
|
32
|
+
- lib/spidr_epg/cookie_jar.rb
|
33
|
+
- lib/spidr_epg/events.rb
|
34
|
+
- lib/spidr_epg/extensions.rb
|
35
|
+
- lib/spidr_epg/extensions/uri.rb
|
36
|
+
- lib/spidr_epg/filters.rb
|
37
|
+
- lib/spidr_epg/headers.rb
|
38
|
+
- lib/spidr_epg/links.rb
|
39
|
+
- lib/spidr_epg/page.rb
|
40
|
+
- lib/spidr_epg/rules.rb
|
41
|
+
- lib/spidr_epg/sanitizers.rb
|
42
|
+
- lib/spidr_epg/session_cache.rb
|
43
|
+
- lib/spidr_epg/spidr.rb
|
44
|
+
- lib/spidr_epg/version.rb
|
45
|
+
- lib/spidr_epg_gem.rb~
|
46
|
+
- lib/spidr_epg_gem~
|
47
|
+
- spidr_epg_gem.gemspec
|
48
|
+
homepage:
|
49
|
+
licenses: []
|
50
|
+
metadata: {}
|
51
|
+
post_install_message:
|
52
|
+
rdoc_options: []
|
53
|
+
require_paths:
|
54
|
+
- lib
|
55
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
56
|
+
requirements:
|
57
|
+
- - ! '>='
|
58
|
+
- !ruby/object:Gem::Version
|
59
|
+
version: 1.9.3
|
60
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
61
|
+
requirements:
|
62
|
+
- - ! '>='
|
63
|
+
- !ruby/object:Gem::Version
|
64
|
+
version: '0'
|
65
|
+
requirements:
|
66
|
+
- none
|
67
|
+
rubyforge_project:
|
68
|
+
rubygems_version: 2.0.3
|
69
|
+
signing_key:
|
70
|
+
specification_version: 4
|
71
|
+
summary: Use for crawling EPG
|
72
|
+
test_files: []
|