unbreakable 0.0.3 → 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/unbreakable.rb +21 -4
- data/lib/unbreakable/processors/transform.rb +50 -3
- data/lib/unbreakable/scraper.rb +46 -16
- data/lib/unbreakable/version.rb +1 -1
- data/unbreakable.gemspec +1 -1
- metadata +9 -9
data/lib/unbreakable.rb
CHANGED
@@ -6,10 +6,10 @@ require 'dragonfly'
|
|
6
6
|
# You may enhance a datastore with {Decorators} and {Observers}: for example,
|
7
7
|
# a {Decorators::Timeout Timeout} decorator to retry on timeout with exponential
|
8
8
|
# backoff and a {Observers::Log Log} observer which logs retrieval progress.
|
9
|
-
# Of course, you must also define a {Processors
|
10
|
-
#
|
9
|
+
# Of course, you must also define a {Processors Processor} to turn your raw data
|
10
|
+
# into machine-readable data.
|
11
11
|
#
|
12
|
-
# A skeleton scraper:
|
12
|
+
# A simple skeleton scraper:
|
13
13
|
#
|
14
14
|
# require 'unbreakable'
|
15
15
|
#
|
@@ -40,7 +40,24 @@ require 'dragonfly'
|
|
40
40
|
#
|
41
41
|
# Every scraper script can run as a command-line script. Try it!
|
42
42
|
#
|
43
|
-
# ruby myscraper.rb
|
43
|
+
# $ ruby myscraper.rb
|
44
|
+
# usage: myscraper [options] <command> [<args>]
|
45
|
+
#
|
46
|
+
# The most commonly used commands are:
|
47
|
+
# retrieve Cache remote files to the datastore for later processing
|
48
|
+
# process Process cached files into machine-readable data
|
49
|
+
# config Print the current configuration
|
50
|
+
#
|
51
|
+
# Specific options:
|
52
|
+
# --root_path ARG default "/var/tmp/unbreakable"
|
53
|
+
# --[no-]store_meta default true
|
54
|
+
# --cache_duration ARG default 31536000
|
55
|
+
# --fallback_mime_type ARG default "application/octet-stream"
|
56
|
+
# --secret ARG default "secret yo"
|
57
|
+
# --[no-]trust_file_extensions default true
|
58
|
+
#
|
59
|
+
# General options:
|
60
|
+
# -h, --help Display this screen
|
44
61
|
module Unbreakable
|
45
62
|
autoload :Scraper, 'unbreakable/scraper'
|
46
63
|
|
@@ -1,6 +1,51 @@
|
|
1
1
|
module Unbreakable
|
2
|
+
# Processors are {http://markevans.github.com/dragonfly/file.Processing.html
|
3
|
+
# Dragonfly} processors. For example:
|
4
|
+
#
|
5
|
+
# class MyProcessor
|
6
|
+
# def coolify(temp_object, opts = {})
|
7
|
+
# SomeLib.coolify(temp_object.data, opts)
|
8
|
+
# end
|
9
|
+
#
|
10
|
+
# def uglify(temp_object, ugliness)
|
11
|
+
# `uglify -i #{temp_object.path} -u #{ugliness}`
|
12
|
+
# end
|
13
|
+
#
|
14
|
+
# def conditional(temp_object, format, pages)
|
15
|
+
# throw :unable_to_handle unless format == :pdf
|
16
|
+
# # do stuff
|
17
|
+
# end
|
18
|
+
#
|
19
|
+
# private
|
20
|
+
#
|
21
|
+
# def my_helper_method
|
22
|
+
# # do stuff
|
23
|
+
# end
|
24
|
+
# end
|
25
|
+
# MyScraper.processor.register MyProcessor
|
26
|
+
#
|
27
|
+
# Public methods must return an object with which a +TempObject+ may be
|
28
|
+
# initialized (+String+, +File+, +Tempfile+, +Pathname+ or +TempObject+).
|
29
|
+
#
|
30
|
+
# You can raise +Dragonfly::Configurable::NotConfigured+ if a configurable
|
31
|
+
# variable is required but missing. If a variable is invalid, you can raise
|
32
|
+
# +Dragonfly::Configurable::BadConfigAttribute+.
|
33
|
+
#
|
34
|
+
# If a process has dependencies or conditions, then you can test for these
|
35
|
+
# conditions and throw +:unable_to_handle+ to skip processing.
|
36
|
+
#
|
37
|
+
# If multiple processors define a public method by the same name, the methods
|
38
|
+
# will be run in reverse order from the last processor to define the method
|
39
|
+
# until one fails to throw +:unable_to_handle+. If all of them throw it, then
|
40
|
+
# +Dragonfly::FunctionManager::UnableToHandle+ will be raised.
|
41
|
+
#
|
42
|
+
# As such, if you are writing a document-to-plain-text converter, you can
|
43
|
+
# write a pdftotext processor, a doctopdf processor, etc. which all define
|
44
|
+
# a +to_text+ public method, and use +:unable_to_handle+ to make sure the
|
45
|
+
# correct processor runs.
|
2
46
|
module Processors
|
3
|
-
#
|
47
|
+
# If you are writing a simple scraper and only need one processor, you may
|
48
|
+
# implement a single +transform+ processor method by subclassing this class:
|
4
49
|
#
|
5
50
|
# require 'nokogiri'
|
6
51
|
# class MyProcessor < Unbreakable::Processors::Transform
|
@@ -21,8 +66,10 @@ module Unbreakable
|
|
21
66
|
# * +perform+
|
22
67
|
# * +persist+
|
23
68
|
#
|
24
|
-
#
|
25
|
-
#
|
69
|
+
# +transform+ calls +persist+ with the output of +perform+. This makes it
|
70
|
+
# easy for others to subclass your processor and just change the +persist+
|
71
|
+
# method to change the external database, for example, while still taking
|
72
|
+
# advantage of the hard work done by +perform+.
|
26
73
|
class Transform
|
27
74
|
include Dragonfly::Configurable
|
28
75
|
include Dragonfly::Loggable
|
data/lib/unbreakable/scraper.rb
CHANGED
@@ -2,7 +2,7 @@ require 'forwardable'
|
|
2
2
|
require 'optparse'
|
3
3
|
require 'securerandom'
|
4
4
|
|
5
|
-
require 'active_support/
|
5
|
+
require 'active_support/core_ext/class/attribute_accessors'
|
6
6
|
|
7
7
|
module Unbreakable
|
8
8
|
# You may implement a scraper by subclassing this class:
|
@@ -44,6 +44,9 @@ module Unbreakable
|
|
44
44
|
def_delegators :@app, :add_child_configurable, :configure, :datastore,
|
45
45
|
:fetch, :log, :processor
|
46
46
|
|
47
|
+
cattr_accessor :commands
|
48
|
+
@@commands = []
|
49
|
+
|
47
50
|
# Initializes a Dragonfly app for storage and processing.
|
48
51
|
def initialize
|
49
52
|
@app = Dragonfly[SecureRandom.hex.to_sym]
|
@@ -73,10 +76,12 @@ The most commonly used commands are:
|
|
73
76
|
|
74
77
|
@opts.separator ''
|
75
78
|
@opts.separator 'Specific options:'
|
79
|
+
specific_options
|
76
80
|
extract_configuration @app
|
77
81
|
|
78
82
|
@opts.separator ''
|
79
83
|
@opts.separator 'General options:'
|
84
|
+
general_options
|
80
85
|
@opts.on_tail('-h', '--help', 'Display this screen') do
|
81
86
|
puts @opts
|
82
87
|
exit
|
@@ -85,6 +90,24 @@ The most commonly used commands are:
|
|
85
90
|
@opts
|
86
91
|
end
|
87
92
|
|
93
|
+
# def specific_options
|
94
|
+
# @opts.on('--echo ARG', 'Write a string to standard output') do |x|
|
95
|
+
# puts x
|
96
|
+
# end
|
97
|
+
# end
|
98
|
+
#
|
99
|
+
# @abstract Override to add specific options to the option parser.
|
100
|
+
# Hook for subclasses: override to register scraper-specific options on the
# option parser. The default implementation intentionally does nothing.
def specific_options
end
|
101
|
+
|
102
|
+
# def general_options
|
103
|
+
# @opts.on('--echo ARG', 'Write a string to standard output') do |x|
|
104
|
+
# puts x
|
105
|
+
# end
|
106
|
+
# end
|
107
|
+
#
|
108
|
+
# @abstract Override to add general options to the option parser.
|
109
|
+
# Hook for subclasses: override to register additional general options on the
# option parser. The default implementation intentionally does nothing.
def general_options
end
|
110
|
+
|
88
111
|
# Runs the command. Most often run from a command-line script as:
|
89
112
|
#
|
90
113
|
# scraper.run(ARGV)
|
@@ -104,7 +127,12 @@ The most commonly used commands are:
|
|
104
127
|
when nil
|
105
128
|
puts opts
|
106
129
|
else
|
107
|
-
|
130
|
+
# Allow subclasses to add more commands.
|
131
|
+
if self.commands.include? command.to_sym
|
132
|
+
send command, args
|
133
|
+
else
|
134
|
+
opts.abort "'#{command}' is not a #{opts.program_name} command. See '#{opts.program_name} --help'."
|
135
|
+
end
|
108
136
|
end
|
109
137
|
end
|
110
138
|
|
@@ -171,17 +199,18 @@ The most commonly used commands are:
|
|
171
199
|
|
172
200
|
# @param [#configuration] object
|
173
201
|
def extract_configuration(object)
|
174
|
-
object.
|
175
|
-
|
176
|
-
|
177
|
-
|
202
|
+
object.config_methods.each do |meth|
|
203
|
+
default = object.configuration[meth] || object.default_configuration[meth]
|
204
|
+
if true === default or false === default
|
205
|
+
@opts.on("--[no-]#{meth}", "default #{default.inspect}") do |x|
|
206
|
+
object.configure{|c| c.send "#{meth}=", x}
|
178
207
|
end
|
179
|
-
elsif String ===
|
180
|
-
@opts.on("--#{
|
181
|
-
object.send "#{
|
208
|
+
elsif String === default or Fixnum === default
|
209
|
+
@opts.on("--#{meth} ARG", "default #{default.inspect}") do |x|
|
210
|
+
object.configure{|c| c.send "#{meth}=", x}
|
182
211
|
end
|
183
|
-
elsif object !=
|
184
|
-
extract_configuration
|
212
|
+
elsif object != default and default.respond_to? :configuration
|
213
|
+
extract_configuration default
|
185
214
|
end
|
186
215
|
end
|
187
216
|
end
|
@@ -190,11 +219,12 @@ The most commonly used commands are:
|
|
190
219
|
def print_configuration(object, indent = 0)
|
191
220
|
indentation = ' ' * indent
|
192
221
|
puts "#{indentation}#{object.class.name}:"
|
193
|
-
object.
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
222
|
+
object.config_methods.each do |meth|
|
223
|
+
default = object.configuration[meth] || object.default_configuration[meth]
|
224
|
+
if true === default or false === default or String === default or Fixnum === default
|
225
|
+
puts " #{indentation}#{meth.to_s.ljust 25 - indent}#{default.inspect}"
|
226
|
+
elsif object != default and default.respond_to? :configuration
|
227
|
+
print_configuration default, indent + 2
|
198
228
|
end
|
199
229
|
end
|
200
230
|
end
|
data/lib/unbreakable/version.rb
CHANGED
data/unbreakable.gemspec
CHANGED
@@ -20,6 +20,6 @@ Gem::Specification.new do |s|
|
|
20
20
|
s.require_paths = ["lib"]
|
21
21
|
|
22
22
|
s.add_runtime_dependency('activesupport', '~> 3.1.0')
|
23
|
-
s.add_runtime_dependency('dragonfly', '~> 0.9.
|
23
|
+
s.add_runtime_dependency('dragonfly', '~> 0.9.8')
|
24
24
|
s.add_development_dependency('rspec', '~> 2.6.0')
|
25
25
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: unbreakable
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.4
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2011-09-
|
12
|
+
date: 2011-09-09 00:00:00.000000000Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: activesupport
|
16
|
-
requirement: &
|
16
|
+
requirement: &70196241006160 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ~>
|
@@ -21,21 +21,21 @@ dependencies:
|
|
21
21
|
version: 3.1.0
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70196241006160
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: dragonfly
|
27
|
-
requirement: &
|
27
|
+
requirement: &70196240974480 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ~>
|
31
31
|
- !ruby/object:Gem::Version
|
32
|
-
version: 0.9.
|
32
|
+
version: 0.9.8
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *70196240974480
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: rspec
|
38
|
-
requirement: &
|
38
|
+
requirement: &70196240973840 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ~>
|
@@ -43,7 +43,7 @@ dependencies:
|
|
43
43
|
version: 2.6.0
|
44
44
|
type: :development
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *70196240973840
|
47
47
|
description: Abstracts and bulletproofs common scraping tasks.
|
48
48
|
email:
|
49
49
|
- info@opennorth.ca
|