outlander 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 6b6951e27ebb022fbc2ec65a1b7394e84d876868
4
- data.tar.gz: d77d4077c1cbdca9a6c09980c12016a9639b05d4
3
+ metadata.gz: 93eeac684cf867eacf0e54ee6d1967769e2a1c8e
4
+ data.tar.gz: 42d585420132fbdd11a665d9f89990e8ab3adfd6
5
5
  SHA512:
6
- metadata.gz: 0a4ceb2e8a3e564db5ec088855428711e66a63e903f35c58ed18f127cea3eebea2fa7af673b872494dceffdbc54c7672f21dc78a8f5ff276d9df4fe5106ce2e4
7
- data.tar.gz: 2da3b6e9e6939c0032004ef4fa71fea7d7ed2f65f6c20907bbfa318a7864beee8195c76821023f588669d8b1f355d8b3944c76d126584e3d1fe0f12caa010f27
6
+ metadata.gz: 88c97f88ec619bb42df78940234628d45cc2722b8a11257d02a441d99a0829a715bc5fbb1f56bf4f014275f07fc1b4daa8c426b02de9dce8fa5e7a03c8d72789
7
+ data.tar.gz: dbdfaa58aab3e29b194892204bc5b9b45dd35355af580e72d9dee2fc93a7f129ab42d451d174d929c044acc1786a5d3bade250fab5684b6aa64a1650daa180af
data/.gitignore CHANGED
@@ -7,3 +7,4 @@
7
7
  /pkg/
8
8
  /spec/reports/
9
9
  /tmp/
10
+ .DS_Store
data/lib/outlander.rb CHANGED
@@ -1,3 +1,2 @@
1
- require "outlander/version"
2
-
1
+ require 'outlander/version'
3
2
  require 'outlander/crawler'
@@ -5,37 +5,5 @@ module Outlander
5
5
  class Agent
6
6
 
7
7
  include HTTParty
8
-
9
- class << self
10
- attr_accessor :cache_storage
11
- end
12
-
13
- def self.fetch(path, options = {}, &block)
14
- if cache_enabled? && options[:cache]
15
- response_body = @cache_storage.get(path) || get(path, options, &block).body
16
- @cache_storage.set(path, response_body)
17
- response_body
18
- else
19
- get(path, options, &block).body
20
- end
21
- end
22
-
23
- def self.method_missing(m, *args, &block)
24
- if m.to_s.end_with? '_with_cache'
25
- if cache_enabled?
26
- response_body = @cache_storage.get(args[0]) || get(*args, &block).body
27
- @cache_storage.set(args[0], response_body)
28
- response_body
29
- else
30
- get(*args, &block).body
31
- end
32
- end
33
- end
34
-
35
- private
36
-
37
- def self.cache_enabled?
38
- !!@cache_storage
39
- end
40
8
  end
41
9
  end
@@ -1,5 +1,5 @@
1
1
  require 'nokogiri'
2
- require 'logger'
2
+ require 'thread'
3
3
  require 'outlander/agent'
4
4
  require 'outlander/threads_pool'
5
5
 
@@ -8,7 +8,8 @@ module Outlander
8
8
  module Crawler
9
9
 
10
10
  DEFAULT_OPTIONS = {
11
- num_threads: 3
11
+ num_threads: 3,
12
+ pause: 1
12
13
  }
13
14
 
14
15
  class << self
@@ -24,9 +25,9 @@ module Outlander
24
25
 
25
26
  module ClassMethods
26
27
 
27
- attr_reader :roots, :setup, :handlers
28
+ attr_reader :roots, :handlers, :setup
28
29
 
29
- def entrypoint(url, handler = :process_root)
30
+ def entry_point(url, handler = :process_root)
30
31
  @roots[url] = handler
31
32
  end
32
33
 
@@ -44,13 +45,13 @@ module Outlander
44
45
  end
45
46
 
46
47
  def initialize(options = {})
47
- agent.cache_storage = options.delete(:cache_storage)
48
- @logger = Logger.new(options.fetch(:log_to, STDOUT))
49
- @options = options.merge DEFAULT_OPTIONS
48
+ @cache = options.delete(:cache)
49
+ @options = DEFAULT_OPTIONS.merge options
50
50
  @history = {}
51
+ @mutex = Mutex.new
51
52
  @pool = ThreadsPool.new @options[:num_threads]
52
53
  self.class.roots.each do |url, handler|
53
- enqueue url, handler
54
+ discover url, handler
54
55
  end
55
56
  end
56
57
 
@@ -62,27 +63,46 @@ module Outlander
62
63
 
63
64
  private
64
65
 
65
- def record(data)
66
- @result_handler.call data
66
+ def record(*args)
67
+ @result_handler.call *args
67
68
  end
68
69
 
69
- def enqueue(url, handler, *args)
70
- return if @history[url] == handler
70
+ def discover(url, handler, *args)
71
+ @mutex.synchronize {
72
+ return if @history[url] == handler
73
+ @history[url] = handler
74
+ }
71
75
  @pool.enqueue do
72
76
  begin
73
- body = agent.get_with_cache(url)
77
+ body = fetch url do
78
+ sleep rand(@options[:pause])
79
+ @agent.get(url).body
80
+ end
74
81
  instance_exec Nokogiri::HTML(body), *args, &self.class.handlers[handler.to_sym]
75
82
  rescue => e
76
- @logger.error "Failed to process #{url} with ##{handler} #{e.inspect}"
77
- else
78
- @logger.info "Processed #{url} with ##{handler}"
83
+ puts "[ERROR] Failed to process #{url} with ##{handler} #{e.inspect}"
79
84
  end
80
85
  end
81
- @logger.info "Enqueued #{url} for ##{handler}"
82
86
  end
83
87
 
84
88
  def agent
85
- @agent ||= Agent.dup
89
+ @agent ||= Class.new(Agent)
90
+ end
91
+
92
+ def fetch(url, &block)
93
+ if cache_enabled?
94
+ return @cache.get url if @cache.exists url
95
+
96
+ value = block.call
97
+ @cache.set url, value
98
+ value
99
+ else
100
+ block.call
101
+ end
102
+ end
103
+
104
+ def cache_enabled?
105
+ !!@cache
86
106
  end
87
107
  end
88
108
  end
@@ -15,7 +15,7 @@ module Outlander
15
15
 
16
16
  def start
17
17
  raise "Could not start with empty queue" if @queue.empty?
18
- run_threads(@num_threads)
18
+ run_threads @num_threads
19
19
  sleep 1 until @queue.empty? && @queue.num_waiting == @num_threads
20
20
  end
21
21
 
@@ -1,3 +1,3 @@
1
1
  module Outlander
2
- VERSION = "0.1.0"
2
+ VERSION = "0.2.0"
3
3
  end
data/outlander.gemspec CHANGED
@@ -24,5 +24,4 @@ Gem::Specification.new do |spec|
24
24
 
25
25
  spec.add_dependency "httparty"
26
26
  spec.add_dependency "nokogiri"
27
- spec.add_dependency "dalli"
28
27
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: outlander
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Terry Progetto
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2017-05-08 00:00:00.000000000 Z
11
+ date: 2017-11-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -66,20 +66,6 @@ dependencies:
66
66
  - - ">="
67
67
  - !ruby/object:Gem::Version
68
68
  version: '0'
69
- - !ruby/object:Gem::Dependency
70
- name: dalli
71
- requirement: !ruby/object:Gem::Requirement
72
- requirements:
73
- - - ">="
74
- - !ruby/object:Gem::Version
75
- version: '0'
76
- type: :runtime
77
- prerelease: false
78
- version_requirements: !ruby/object:Gem::Requirement
79
- requirements:
80
- - - ">="
81
- - !ruby/object:Gem::Version
82
- version: '0'
83
69
  description: Multithreaded web crawler with transparent DSL and requests caching.
84
70
  email:
85
71
  - terryprogetto@gmail.com
@@ -87,16 +73,13 @@ executables: []
87
73
  extensions: []
88
74
  extra_rdoc_files: []
89
75
  files:
90
- - ".DS_Store"
91
76
  - ".gitignore"
92
77
  - Gemfile
93
78
  - README.md
94
79
  - Rakefile
95
80
  - bin/console
96
81
  - bin/setup
97
- - lib/.DS_Store
98
82
  - lib/outlander.rb
99
- - lib/outlander/.DS_Store
100
83
  - lib/outlander/agent.rb
101
84
  - lib/outlander/crawler.rb
102
85
  - lib/outlander/threads_pool.rb
@@ -121,7 +104,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
121
104
  version: '0'
122
105
  requirements: []
123
106
  rubyforge_project:
124
- rubygems_version: 2.6.11
107
+ rubygems_version: 2.6.13
125
108
  signing_key:
126
109
  specification_version: 4
127
110
  summary: Web pages crawler.
data/.DS_Store DELETED
Binary file
data/lib/.DS_Store DELETED
Binary file
Binary file