outlander 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 6b6951e27ebb022fbc2ec65a1b7394e84d876868
4
+ data.tar.gz: d77d4077c1cbdca9a6c09980c12016a9639b05d4
5
+ SHA512:
6
+ metadata.gz: 0a4ceb2e8a3e564db5ec088855428711e66a63e903f35c58ed18f127cea3eebea2fa7af673b872494dceffdbc54c7672f21dc78a8f5ff276d9df4fe5106ce2e4
7
+ data.tar.gz: 2da3b6e9e6939c0032004ef4fa71fea7d7ed2f65f6c20907bbfa318a7864beee8195c76821023f588669d8b1f355d8b3944c76d126584e3d1fe0f12caa010f27
data/.DS_Store ADDED
Binary file
data/.gitignore ADDED
@@ -0,0 +1,9 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in outlander.gemspec
4
+ gemspec
data/README.md ADDED
@@ -0,0 +1,36 @@
1
+ # Outlander
2
+
3
+ Outlander is a multithreaded web crawler with a transparent DSL and request caching. The library code lives in `lib/outlander`. To experiment with it, run `bin/console` for an interactive prompt.
4
+
5
+ It consists of `Outlander::Crawler` (the DSL mixin), `Outlander::Agent` (an HTTParty-based HTTP client with optional response caching) and `Outlander::ThreadsPool` (the worker pool that runs the crawl tasks).
6
+
7
+ ## Installation
8
+
9
+ Add this line to your application's Gemfile:
10
+
11
+ ```ruby
12
+ gem 'outlander'
13
+ ```
14
+
15
+ And then execute:
16
+
17
+ $ bundle
18
+
19
+ Or install it yourself as:
20
+
21
+ $ gem install outlander
22
+
23
+ ## Usage
24
+
25
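+ Include `Outlander::Crawler` in a class, declare start URLs with `entrypoint`, prepare state in a `before_start` block, and register page handlers with `process_*` blocks (the default entry handler is `process_root`). Inside a handler, call `enqueue(url, :process_something)` to follow links and `record(data)` to emit results; `YourCrawler.new.run! { |data| ... }` starts the crawl and yields each recorded result to the block.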
26
+
27
+ ## Development
28
+
29
+ After checking out the repo, run `bin/setup` to install dependencies. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
30
+
31
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
32
+
33
+ ## Contributing
34
+
35
+ Bug reports and pull requests are welcome on GitHub at https://github.com/[USERNAME]/outlander.
36
+
data/Rakefile ADDED
@@ -0,0 +1,2 @@
1
require "bundler/gem_tasks"

# The previous default (`task :default => :spec`) referenced a :spec task that
# this Rakefile never defines, so a bare `rake` aborted. Point the default at
# :build, which is one of the tasks installed by bundler/gem_tasks.
task :default => :build
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "outlander"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start(__FILE__)
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
data/lib/.DS_Store ADDED
Binary file
Binary file
@@ -0,0 +1,41 @@
1
+ require 'httparty'
2
+
3
module Outlander

  # HTTP client built on HTTParty with optional response-body caching.
  #
  # Caching is active only while +cache_storage+ is set. The storage is used
  # through two calls only: +get(key)+ and +set(key, value)+, so any object
  # offering those works (presumably a Dalli client, given the gem's
  # dependencies - confirm against callers).
  class Agent

    include HTTParty

    class << self
      # Cache backend, or nil when caching is disabled.
      attr_accessor :cache_storage
    end

    # Performs a GET request and returns the response body.
    #
    # @param path [String] URL to request; also used as the cache key
    # @param options [Hash] passed through to HTTParty's +get+; the +:cache+
    #   key additionally opts this call into caching
    # @return [String] response body, served from the cache when possible
    def self.fetch(path, options = {}, &block)
      return get(path, options, &block).body unless cache_enabled? && options[:cache]

      cached(path) { get(path, options, &block).body }
    end

    # Handles +*_with_cache+ class methods (e.g. +get_with_cache(url)+).
    #
    # Whatever precedes the suffix, the request is always issued with +get+;
    # the first argument is the cache key. Any other unknown method is handed
    # to +super+ so it raises NoMethodError instead of silently returning nil.
    def self.method_missing(m, *args, &block)
      return super unless m.to_s.end_with?('_with_cache')
      return get(*args, &block).body unless cache_enabled?

      cached(args[0]) { get(*args, &block).body }
    end

    # Keeps +respond_to?+ in sync with +method_missing+.
    def self.respond_to_missing?(m, include_private = false)
      m.to_s.end_with?('_with_cache') || super
    end

    # @return [Boolean] true when a cache storage has been configured
    def self.cache_enabled?
      !!@cache_storage
    end

    # Returns the cached value for +key+, or stores and returns the block's
    # result on a miss. The cache is written only on a miss, so a hit does not
    # cost a second round-trip to the storage.
    def self.cached(key)
      hit = @cache_storage.get(key)
      return hit if hit

      fresh = yield
      @cache_storage.set(key, fresh)
      fresh
    end

    # A bare `private` does not apply to `def self.` methods, so the helpers
    # are hidden explicitly.
    private_class_method :cache_enabled?, :cached
  end
end
@@ -0,0 +1,88 @@
1
+ require 'nokogiri'
2
+ require 'logger'
3
+ require 'outlander/agent'
4
+ require 'outlander/threads_pool'
5
+
6
module Outlander

  # Mixin that gives a class a small crawling DSL.
  #
  # At class level: +entrypoint+ declares start URLs, +before_start+ declares a
  # setup block, and any +process_*+ call with a block registers a page handler.
  # At instance level: +run!+ starts the crawl; handlers run with the crawler
  # instance as +self+ and may call the private helpers +enqueue+ and +record+.
  module Crawler

    DEFAULT_OPTIONS = {
      num_threads: 3
    }.freeze

    class << self
      # Extends the including class with the DSL and gives it its own
      # (initially empty) registries of entry points and handlers.
      def included(base)
        base.extend ClassMethods

        base.class_eval do
          @roots = {}
          @handlers = {}
        end
      end
    end

    module ClassMethods

      attr_reader :roots, :setup, :handlers

      # Registers +url+ as a start page processed by +handler+.
      def entrypoint(url, handler = :process_root)
        @roots[url] = handler
      end

      # Registers a block that +run!+ evaluates on the instance before the
      # pool is started.
      def before_start(&block)
        @setup = block
      end

      # Registers a handler for every +process_*+ call that carries a block.
      # Calls without a block fall through to +super+ so that a typo does not
      # silently overwrite a handler with nil.
      def method_missing(m, *args, &block)
        if m.to_s.start_with?('process_') && block
          @handlers[m] = block
        else
          super
        end
      end

      # Keeps +respond_to?+ in sync with +method_missing+.
      def respond_to_missing?(m, include_private = false)
        m.to_s.start_with?('process_') || super
      end
    end

    # @param options [Hash] +:cache_storage+ is handed to the agent,
    #   +:log_to+ is the Logger target (default STDOUT), +:num_threads+ sizes
    #   the pool (default 3). The caller's hash is not modified.
    def initialize(options = {})
      options = options.dup
      agent.cache_storage = options.delete(:cache_storage)
      @logger = Logger.new(options.fetch(:log_to, STDOUT))
      # Defaults first, so that values supplied by the caller win.
      @options = DEFAULT_OPTIONS.merge(options)
      @history = {}
      # enqueue is also called from handlers running on pool threads.
      @history_lock = Mutex.new
      @pool = ThreadsPool.new @options[:num_threads]
      self.class.roots.each do |url, handler|
        enqueue url, handler
      end
    end

    # Runs the optional +before_start+ block, then starts the pool.
    #
    # @yield [data] every value passed to +record+ by a handler
    def run!(&block)
      @result_handler = block
      setup = self.class.setup
      instance_eval(&setup) if setup
      @pool.start
    end

    private

    # Passes +data+ to the block given to +run!+ (no-op without a block).
    def record(data)
      @result_handler.call data if @result_handler
    end

    # Schedules +url+ for processing by +handler+; extra +args+ are passed to
    # the handler after the parsed document. A url/handler pair is scheduled
    # at most once per crawler instance.
    def enqueue(url, handler, *args)
      visit = [url, handler.to_sym]
      fresh = @history_lock.synchronize do
        @history.key?(visit) ? false : (@history[visit] = true)
      end
      return unless fresh

      @pool.enqueue do
        begin
          callback = self.class.handlers.fetch(handler.to_sym) do
            raise ArgumentError, "no handler registered for ##{handler}"
          end
          body = agent.get_with_cache(url)
          instance_exec Nokogiri::HTML(body), *args, &callback
        rescue => e
          @logger.error "Failed to process #{url} with ##{handler} #{e.inspect}"
        else
          @logger.info "Processed #{url} with ##{handler}"
        end
      end
      @logger.info "Enqueued #{url} for ##{handler}"
    end

    # Per-instance copy of Agent, so that cache_storage is not shared between
    # crawler instances.
    def agent
      @agent ||= Agent.dup
    end
  end
end
@@ -0,0 +1,34 @@
1
+ require 'thread'
2
+
3
module Outlander

  # Fixed-size pool of worker threads draining a shared task queue.
  #
  # Tasks may enqueue further tasks while they run; +start+ returns once every
  # task, including those added along the way, has finished.
  class ThreadsPool

    # @param num_threads [Integer] number of worker threads (at least 1)
    # @raise [ArgumentError] if +num_threads+ is not a positive integer
    def initialize(num_threads = 3)
      @num_threads = Integer(num_threads)
      raise ArgumentError, "num_threads must be positive" if @num_threads < 1

      @queue = Queue.new
      @lock = Mutex.new
      @drained = ConditionVariable.new
      @pending = 0 # tasks enqueued but not yet finished
    end

    # Adds a task to the queue.
    #
    # @raise [ArgumentError] when no block is given (a nil entry in the queue
    #   is the signal that tells a worker to exit)
    def enqueue(&task)
      raise ArgumentError, "a block is required" unless task

      @lock.synchronize { @pending += 1 }
      @queue << task
    end

    # Runs all queued tasks on +num_threads+ workers and blocks until the
    # queue is drained and every task has finished. The workers are stopped
    # and joined before returning, so no threads are left behind.
    #
    # @raise [RuntimeError] if nothing has been enqueued
    def start
      raise "Could not start with empty queue" if @queue.empty?

      workers = run_threads(@num_threads)
      begin
        @lock.synchronize { @drained.wait(@lock) until @pending.zero? }
      ensure
        # One nil per worker ends its `while task = pop` loop.
        workers.size.times { @queue << nil }
        workers.each(&:join)
      end
    end

    private

    # Spawns the workers and returns them.
    def run_threads(num_threads = 1)
      Array.new(num_threads) do
        Thread.new do
          while (task = @queue.pop)
            run_task(task)
          end
        end
      end
    end

    # Runs one task. A failing task is reported on stderr instead of killing
    # its worker, and the pending counter is always decremented so that
    # +start+ cannot wait forever.
    def run_task(task)
      task.call
    rescue StandardError => e
      warn "Outlander::ThreadsPool task failed: #{e.inspect}"
    ensure
      @lock.synchronize do
        @pending -= 1
        @drained.broadcast if @pending.zero?
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
module Outlander
  # Gem version, read by outlander.gemspec. Frozen so the constant's string
  # cannot be mutated in place.
  VERSION = "0.1.0".freeze
end
data/lib/outlander.rb ADDED
@@ -0,0 +1,3 @@
1
+ require "outlander/version"
2
+
3
+ require 'outlander/crawler'
data/outlander.gemspec ADDED
@@ -0,0 +1,28 @@
1
+ # coding: utf-8
2
# Make lib/ loadable so the version constant can be read below.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'outlander/version'

Gem::Specification.new do |spec|
  spec.name          = "outlander"
  spec.version       = Outlander::VERSION
  spec.authors       = ["Terry Progetto"]
  spec.email         = ["terryprogetto@gmail.com"]

  spec.summary       = %q{Web pages crawler.}
  spec.description   = %q{Multithreaded web crawler with transparent DSL and requests caching.}

  # Package every git-tracked file except tests and macOS Finder metadata
  # (.DS_Store files were previously shipped inside the gem).
  spec.files         = `git ls-files -z`.split("\x0").reject do |f|
    f.match(%r{^(test|spec|features)/}) || File.basename(f) == '.DS_Store'
  end
  spec.bindir        = "exe"
  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 1.14"
  spec.add_development_dependency "rake", "~> 10.0"

  spec.add_dependency "httparty"
  spec.add_dependency "nokogiri"
  # NOTE(review): dalli is not required anywhere under lib/; presumably it is
  # the intended cache_storage backend - confirm it must be a hard dependency.
  spec.add_dependency "dalli"
end
metadata ADDED
@@ -0,0 +1,128 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: outlander
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Terry Progetto
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2017-05-08 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.14'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.14'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: httparty
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: nokogiri
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: dalli
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ description: Multithreaded web crawler with transparent DSL and requests caching.
84
+ email:
85
+ - terryprogetto@gmail.com
86
+ executables: []
87
+ extensions: []
88
+ extra_rdoc_files: []
89
+ files:
90
+ - ".DS_Store"
91
+ - ".gitignore"
92
+ - Gemfile
93
+ - README.md
94
+ - Rakefile
95
+ - bin/console
96
+ - bin/setup
97
+ - lib/.DS_Store
98
+ - lib/outlander.rb
99
+ - lib/outlander/.DS_Store
100
+ - lib/outlander/agent.rb
101
+ - lib/outlander/crawler.rb
102
+ - lib/outlander/threads_pool.rb
103
+ - lib/outlander/version.rb
104
+ - outlander.gemspec
105
+ homepage:
106
+ licenses: []
107
+ metadata: {}
108
+ post_install_message:
109
+ rdoc_options: []
110
+ require_paths:
111
+ - lib
112
+ required_ruby_version: !ruby/object:Gem::Requirement
113
+ requirements:
114
+ - - ">="
115
+ - !ruby/object:Gem::Version
116
+ version: '0'
117
+ required_rubygems_version: !ruby/object:Gem::Requirement
118
+ requirements:
119
+ - - ">="
120
+ - !ruby/object:Gem::Version
121
+ version: '0'
122
+ requirements: []
123
+ rubyforge_project:
124
+ rubygems_version: 2.6.11
125
+ signing_key:
126
+ specification_version: 4
127
+ summary: Web pages crawler.
128
+ test_files: []