outlander 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 6b6951e27ebb022fbc2ec65a1b7394e84d876868
4
- data.tar.gz: d77d4077c1cbdca9a6c09980c12016a9639b05d4
3
+ metadata.gz: 93eeac684cf867eacf0e54ee6d1967769e2a1c8e
4
+ data.tar.gz: 42d585420132fbdd11a665d9f89990e8ab3adfd6
5
5
  SHA512:
6
- metadata.gz: 0a4ceb2e8a3e564db5ec088855428711e66a63e903f35c58ed18f127cea3eebea2fa7af673b872494dceffdbc54c7672f21dc78a8f5ff276d9df4fe5106ce2e4
7
- data.tar.gz: 2da3b6e9e6939c0032004ef4fa71fea7d7ed2f65f6c20907bbfa318a7864beee8195c76821023f588669d8b1f355d8b3944c76d126584e3d1fe0f12caa010f27
6
+ metadata.gz: 88c97f88ec619bb42df78940234628d45cc2722b8a11257d02a441d99a0829a715bc5fbb1f56bf4f014275f07fc1b4daa8c426b02de9dce8fa5e7a03c8d72789
7
+ data.tar.gz: dbdfaa58aab3e29b194892204bc5b9b45dd35355af580e72d9dee2fc93a7f129ab42d451d174d929c044acc1786a5d3bade250fab5684b6aa64a1650daa180af
data/.gitignore CHANGED
@@ -7,3 +7,4 @@
7
7
  /pkg/
8
8
  /spec/reports/
9
9
  /tmp/
10
+ .DS_Store
data/lib/outlander.rb CHANGED
@@ -1,3 +1,2 @@
1
- require "outlander/version"
2
-
1
+ require 'outlander/version'
3
2
  require 'outlander/crawler'
@@ -5,37 +5,5 @@ module Outlander
5
5
  class Agent
6
6
 
7
7
  include HTTParty
8
-
9
- class << self
10
- attr_accessor :cache_storage
11
- end
12
-
13
- def self.fetch(path, options = {}, &block)
14
- if cache_enabled? && options[:cache]
15
- response_body = @cache_storage.get(path) || get(path, options, &block).body
16
- @cache_storage.set(path, response_body)
17
- response_body
18
- else
19
- get(path, options, &block).body
20
- end
21
- end
22
-
23
- def self.method_missing(m, *args, &block)
24
- if m.to_s.end_with? '_with_cache'
25
- if cache_enabled?
26
- response_body = @cache_storage.get(args[0]) || get(*args, &block).body
27
- @cache_storage.set(args[0], response_body)
28
- response_body
29
- else
30
- get(*args, &block).body
31
- end
32
- end
33
- end
34
-
35
- private
36
-
37
- def self.cache_enabled?
38
- !!@cache_storage
39
- end
40
8
  end
41
9
  end
@@ -1,5 +1,5 @@
1
1
  require 'nokogiri'
2
- require 'logger'
2
+ require 'thread'
3
3
  require 'outlander/agent'
4
4
  require 'outlander/threads_pool'
5
5
 
@@ -8,7 +8,8 @@ module Outlander
8
8
  module Crawler
9
9
 
10
10
  DEFAULT_OPTIONS = {
11
- num_threads: 3
11
+ num_threads: 3,
12
+ pause: 1
12
13
  }
13
14
 
14
15
  class << self
@@ -24,9 +25,9 @@ module Outlander
24
25
 
25
26
  module ClassMethods
26
27
 
27
- attr_reader :roots, :setup, :handlers
28
+ attr_reader :roots, :handlers, :setup
28
29
 
29
- def entrypoint(url, handler = :process_root)
30
+ def entry_point(url, handler = :process_root)
30
31
  @roots[url] = handler
31
32
  end
32
33
 
@@ -44,13 +45,13 @@ module Outlander
44
45
  end
45
46
 
46
47
  def initialize(options = {})
47
- agent.cache_storage = options.delete(:cache_storage)
48
- @logger = Logger.new(options.fetch(:log_to, STDOUT))
49
- @options = options.merge DEFAULT_OPTIONS
48
+ @cache = options.delete(:cache)
49
+ @options = DEFAULT_OPTIONS.merge options
50
50
  @history = {}
51
+ @mutex = Mutex.new
51
52
  @pool = ThreadsPool.new @options[:num_threads]
52
53
  self.class.roots.each do |url, handler|
53
- enqueue url, handler
54
+ discover url, handler
54
55
  end
55
56
  end
56
57
 
@@ -62,27 +63,46 @@ module Outlander
62
63
 
63
64
  private
64
65
 
65
- def record(data)
66
- @result_handler.call data
66
+ def record(*args)
67
+ @result_handler.call *args
67
68
  end
68
69
 
69
- def enqueue(url, handler, *args)
70
- return if @history[url] == handler
70
+ def discover(url, handler, *args)
71
+ @mutex.synchronize {
72
+ return if @history[url] == handler
73
+ @history[url] = handler
74
+ }
71
75
  @pool.enqueue do
72
76
  begin
73
- body = agent.get_with_cache(url)
77
+ body = fetch url do
78
+ sleep rand(@options[:pause])
79
+ @agent.get(url).body
80
+ end
74
81
  instance_exec Nokogiri::HTML(body), *args, &self.class.handlers[handler.to_sym]
75
82
  rescue => e
76
- @logger.error "Failed to process #{url} with ##{handler} #{e.inspect}"
77
- else
78
- @logger.info "Processed #{url} with ##{handler}"
83
+ puts "[ERROR] Failed to process #{url} with ##{handler} #{e.inspect}"
79
84
  end
80
85
  end
81
- @logger.info "Enqueued #{url} for ##{handler}"
82
86
  end
83
87
 
84
88
  def agent
85
- @agent ||= Agent.dup
89
+ @agent ||= Class.new(Agent)
90
+ end
91
+
92
+ def fetch(url, &block)
93
+ if cache_enabled?
94
+ return @cache.get url if @cache.exists url
95
+
96
+ value = block.call
97
+ @cache.set url, value
98
+ value
99
+ else
100
+ block.call
101
+ end
102
+ end
103
+
104
+ def cache_enabled?
105
+ !!@cache
86
106
  end
87
107
  end
88
108
  end
@@ -15,7 +15,7 @@ module Outlander
15
15
 
16
16
  def start
17
17
  raise "Could not start with empty queue" if @queue.empty?
18
- run_threads(@num_threads)
18
+ run_threads @num_threads
19
19
  sleep 1 until @queue.empty? && @queue.num_waiting == @num_threads
20
20
  end
21
21
 
@@ -1,3 +1,3 @@
1
1
  module Outlander
2
- VERSION = "0.1.0"
2
+ VERSION = "0.2.0"
3
3
  end
data/outlander.gemspec CHANGED
@@ -24,5 +24,4 @@ Gem::Specification.new do |spec|
24
24
 
25
25
  spec.add_dependency "httparty"
26
26
  spec.add_dependency "nokogiri"
27
- spec.add_dependency "dalli"
28
27
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: outlander
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Terry Progetto
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2017-05-08 00:00:00.000000000 Z
11
+ date: 2017-11-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -66,20 +66,6 @@ dependencies:
66
66
  - - ">="
67
67
  - !ruby/object:Gem::Version
68
68
  version: '0'
69
- - !ruby/object:Gem::Dependency
70
- name: dalli
71
- requirement: !ruby/object:Gem::Requirement
72
- requirements:
73
- - - ">="
74
- - !ruby/object:Gem::Version
75
- version: '0'
76
- type: :runtime
77
- prerelease: false
78
- version_requirements: !ruby/object:Gem::Requirement
79
- requirements:
80
- - - ">="
81
- - !ruby/object:Gem::Version
82
- version: '0'
83
69
  description: Multithreaded web crawler with transparent DSL and requests caching.
84
70
  email:
85
71
  - terryprogetto@gmail.com
@@ -87,16 +73,13 @@ executables: []
87
73
  extensions: []
88
74
  extra_rdoc_files: []
89
75
  files:
90
- - ".DS_Store"
91
76
  - ".gitignore"
92
77
  - Gemfile
93
78
  - README.md
94
79
  - Rakefile
95
80
  - bin/console
96
81
  - bin/setup
97
- - lib/.DS_Store
98
82
  - lib/outlander.rb
99
- - lib/outlander/.DS_Store
100
83
  - lib/outlander/agent.rb
101
84
  - lib/outlander/crawler.rb
102
85
  - lib/outlander/threads_pool.rb
@@ -121,7 +104,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
121
104
  version: '0'
122
105
  requirements: []
123
106
  rubyforge_project:
124
- rubygems_version: 2.6.11
107
+ rubygems_version: 2.6.13
125
108
  signing_key:
126
109
  specification_version: 4
127
110
  summary: Web pages crawler.
data/.DS_Store DELETED
Binary file
data/lib/.DS_Store DELETED
Binary file
Binary file