outlander 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/lib/outlander.rb +1 -2
- data/lib/outlander/agent.rb +0 -32
- data/lib/outlander/crawler.rb +38 -18
- data/lib/outlander/threads_pool.rb +1 -1
- data/lib/outlander/version.rb +1 -1
- data/outlander.gemspec +0 -1
- metadata +3 -20
- data/.DS_Store +0 -0
- data/lib/.DS_Store +0 -0
- data/lib/outlander/.DS_Store +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 93eeac684cf867eacf0e54ee6d1967769e2a1c8e
|
4
|
+
data.tar.gz: 42d585420132fbdd11a665d9f89990e8ab3adfd6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 88c97f88ec619bb42df78940234628d45cc2722b8a11257d02a441d99a0829a715bc5fbb1f56bf4f014275f07fc1b4daa8c426b02de9dce8fa5e7a03c8d72789
|
7
|
+
data.tar.gz: dbdfaa58aab3e29b194892204bc5b9b45dd35355af580e72d9dee2fc93a7f129ab42d451d174d929c044acc1786a5d3bade250fab5684b6aa64a1650daa180af
|
data/.gitignore
CHANGED
data/lib/outlander.rb
CHANGED
data/lib/outlander/agent.rb
CHANGED
@@ -5,37 +5,5 @@ module Outlander
|
|
5
5
|
class Agent
|
6
6
|
|
7
7
|
include HTTParty
|
8
|
-
|
9
|
-
class << self
|
10
|
-
attr_accessor :cache_storage
|
11
|
-
end
|
12
|
-
|
13
|
-
def self.fetch(path, options = {}, &block)
|
14
|
-
if cache_enabled? && options[:cache]
|
15
|
-
response_body = @cache_storage.get(path) || get(path, options, &block).body
|
16
|
-
@cache_storage.set(path, response_body)
|
17
|
-
response_body
|
18
|
-
else
|
19
|
-
get(path, options, &block).body
|
20
|
-
end
|
21
|
-
end
|
22
|
-
|
23
|
-
def self.method_missing(m, *args, &block)
|
24
|
-
if m.to_s.end_with? '_with_cache'
|
25
|
-
if cache_enabled?
|
26
|
-
response_body = @cache_storage.get(args[0]) || get(*args, &block).body
|
27
|
-
@cache_storage.set(args[0], response_body)
|
28
|
-
response_body
|
29
|
-
else
|
30
|
-
get(*args, &block).body
|
31
|
-
end
|
32
|
-
end
|
33
|
-
end
|
34
|
-
|
35
|
-
private
|
36
|
-
|
37
|
-
def self.cache_enabled?
|
38
|
-
!!@cache_storage
|
39
|
-
end
|
40
8
|
end
|
41
9
|
end
|
data/lib/outlander/crawler.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
require 'nokogiri'
|
2
|
-
require '
|
2
|
+
require 'thread'
|
3
3
|
require 'outlander/agent'
|
4
4
|
require 'outlander/threads_pool'
|
5
5
|
|
@@ -8,7 +8,8 @@ module Outlander
|
|
8
8
|
module Crawler
|
9
9
|
|
10
10
|
DEFAULT_OPTIONS = {
|
11
|
-
num_threads: 3
|
11
|
+
num_threads: 3,
|
12
|
+
pause: 1
|
12
13
|
}
|
13
14
|
|
14
15
|
class << self
|
@@ -24,9 +25,9 @@ module Outlander
|
|
24
25
|
|
25
26
|
module ClassMethods
|
26
27
|
|
27
|
-
attr_reader :roots, :
|
28
|
+
attr_reader :roots, :handlers, :setup
|
28
29
|
|
29
|
-
def
|
30
|
+
def entry_point(url, handler = :process_root)
|
30
31
|
@roots[url] = handler
|
31
32
|
end
|
32
33
|
|
@@ -44,13 +45,13 @@ module Outlander
|
|
44
45
|
end
|
45
46
|
|
46
47
|
def initialize(options = {})
|
47
|
-
|
48
|
-
@
|
49
|
-
@options = options.merge DEFAULT_OPTIONS
|
48
|
+
@cache = options.delete(:cache)
|
49
|
+
@options = DEFAULT_OPTIONS.merge options
|
50
50
|
@history = {}
|
51
|
+
@mutex = Mutex.new
|
51
52
|
@pool = ThreadsPool.new @options[:num_threads]
|
52
53
|
self.class.roots.each do |url, handler|
|
53
|
-
|
54
|
+
discover url, handler
|
54
55
|
end
|
55
56
|
end
|
56
57
|
|
@@ -62,27 +63,46 @@ module Outlander
|
|
62
63
|
|
63
64
|
private
|
64
65
|
|
65
|
-
def record(
|
66
|
-
@result_handler.call
|
66
|
+
def record(*args)
|
67
|
+
@result_handler.call *args
|
67
68
|
end
|
68
69
|
|
69
|
-
def
|
70
|
-
|
70
|
+
def discover(url, handler, *args)
|
71
|
+
@mutex.synchronize {
|
72
|
+
return if @history[url] == handler
|
73
|
+
@history[url] = handler
|
74
|
+
}
|
71
75
|
@pool.enqueue do
|
72
76
|
begin
|
73
|
-
body =
|
77
|
+
body = fetch url do
|
78
|
+
sleep rand(@options[:pause])
|
79
|
+
@agent.get(url).body
|
80
|
+
end
|
74
81
|
instance_exec Nokogiri::HTML(body), *args, &self.class.handlers[handler.to_sym]
|
75
82
|
rescue => e
|
76
|
-
|
77
|
-
else
|
78
|
-
@logger.info "Processed #{url} with ##{handler}"
|
83
|
+
puts "[ERROR] Failed to process #{url} with ##{handler} #{e.inspect}"
|
79
84
|
end
|
80
85
|
end
|
81
|
-
@logger.info "Enqueued #{url} for ##{handler}"
|
82
86
|
end
|
83
87
|
|
84
88
|
def agent
|
85
|
-
@agent ||= Agent
|
89
|
+
@agent ||= Class.new(Agent)
|
90
|
+
end
|
91
|
+
|
92
|
+
def fetch(url, &block)
|
93
|
+
if cache_enabled?
|
94
|
+
return @cache.get url if @cache.exists url
|
95
|
+
|
96
|
+
value = block.call
|
97
|
+
@cache.set url, value
|
98
|
+
value
|
99
|
+
else
|
100
|
+
block.call
|
101
|
+
end
|
102
|
+
end
|
103
|
+
|
104
|
+
def cache_enabled?
|
105
|
+
!!@cache
|
86
106
|
end
|
87
107
|
end
|
88
108
|
end
|
data/lib/outlander/version.rb
CHANGED
data/outlander.gemspec
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: outlander
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Terry Progetto
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-
|
11
|
+
date: 2017-11-24 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -66,20 +66,6 @@ dependencies:
|
|
66
66
|
- - ">="
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: '0'
|
69
|
-
- !ruby/object:Gem::Dependency
|
70
|
-
name: dalli
|
71
|
-
requirement: !ruby/object:Gem::Requirement
|
72
|
-
requirements:
|
73
|
-
- - ">="
|
74
|
-
- !ruby/object:Gem::Version
|
75
|
-
version: '0'
|
76
|
-
type: :runtime
|
77
|
-
prerelease: false
|
78
|
-
version_requirements: !ruby/object:Gem::Requirement
|
79
|
-
requirements:
|
80
|
-
- - ">="
|
81
|
-
- !ruby/object:Gem::Version
|
82
|
-
version: '0'
|
83
69
|
description: Multithreaded web crawler with transparent DSL and requests caching.
|
84
70
|
email:
|
85
71
|
- terryprogetto@gmail.com
|
@@ -87,16 +73,13 @@ executables: []
|
|
87
73
|
extensions: []
|
88
74
|
extra_rdoc_files: []
|
89
75
|
files:
|
90
|
-
- ".DS_Store"
|
91
76
|
- ".gitignore"
|
92
77
|
- Gemfile
|
93
78
|
- README.md
|
94
79
|
- Rakefile
|
95
80
|
- bin/console
|
96
81
|
- bin/setup
|
97
|
-
- lib/.DS_Store
|
98
82
|
- lib/outlander.rb
|
99
|
-
- lib/outlander/.DS_Store
|
100
83
|
- lib/outlander/agent.rb
|
101
84
|
- lib/outlander/crawler.rb
|
102
85
|
- lib/outlander/threads_pool.rb
|
@@ -121,7 +104,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
121
104
|
version: '0'
|
122
105
|
requirements: []
|
123
106
|
rubyforge_project:
|
124
|
-
rubygems_version: 2.6.
|
107
|
+
rubygems_version: 2.6.13
|
125
108
|
signing_key:
|
126
109
|
specification_version: 4
|
127
110
|
summary: Web pages crawler.
|
data/.DS_Store
DELETED
Binary file
|
data/lib/.DS_Store
DELETED
Binary file
|
data/lib/outlander/.DS_Store
DELETED
Binary file
|