unbreakable 0.0.3 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/unbreakable.rb +21 -4
- data/lib/unbreakable/processors/transform.rb +50 -3
- data/lib/unbreakable/scraper.rb +46 -16
- data/lib/unbreakable/version.rb +1 -1
- data/unbreakable.gemspec +1 -1
- metadata +9 -9
data/lib/unbreakable.rb
CHANGED
@@ -6,10 +6,10 @@ require 'dragonfly'
|
|
6
6
|
# You may enhance a datastore with {Decorators} and {Observers}: for example,
|
7
7
|
# a {Decorators::Timeout Timeout} decorator to retry on timeout with exponential
|
8
8
|
# backoff and a {Observers::Log Log} observer which logs retrieval progress.
|
9
|
-
# Of course, you must also define a {Processors
|
10
|
-
#
|
9
|
+
# Of course, you must also define a {Processors Processor} to turn your raw data
|
10
|
+
# into machine-readable data.
|
11
11
|
#
|
12
|
-
# A skeleton scraper:
|
12
|
+
# A simple skeleton scraper:
|
13
13
|
#
|
14
14
|
# require 'unbreakable'
|
15
15
|
#
|
@@ -40,7 +40,24 @@ require 'dragonfly'
|
|
40
40
|
#
|
41
41
|
# Every scraper script can run as a command-line script. Try it!
|
42
42
|
#
|
43
|
-
# ruby myscraper.rb
|
43
|
+
# $ ruby myscraper.rb
|
44
|
+
# usage: irb [options] <command> [<args>]
|
45
|
+
#
|
46
|
+
# The most commonly used commands are:
|
47
|
+
# retrieve Cache remote files to the datastore for later processing
|
48
|
+
# process Process cached files into machine-readable data
|
49
|
+
# config Print the current configuration
|
50
|
+
#
|
51
|
+
# Specific options:
|
52
|
+
# --root_path ARG default "/var/tmp/unbreakable"
|
53
|
+
# --[no-]store_meta default true
|
54
|
+
# --cache_duration ARG default 31536000
|
55
|
+
# --fallback_mime_type ARG default "application/octet-stream"
|
56
|
+
# --secret ARG default "secret yo"
|
57
|
+
# --[no-]trust_file_extensions default true
|
58
|
+
#
|
59
|
+
# General options:
|
60
|
+
# -h, --help Display this screen
|
44
61
|
module Unbreakable
|
45
62
|
autoload :Scraper, 'unbreakable/scraper'
|
46
63
|
|
@@ -1,6 +1,51 @@
|
|
1
1
|
module Unbreakable
|
2
|
+
# Processors are {http://markevans.github.com/dragonfly/file.Processing.html
|
3
|
+
# Dragonfly} processors. For example:
|
4
|
+
#
|
5
|
+
# class MyProcessor
|
6
|
+
# def coolify(temp_object, opts = {})
|
7
|
+
# SomeLib.coolify(temp_object.data, opts)
|
8
|
+
# end
|
9
|
+
#
|
10
|
+
# def uglify(temp_object, ugliness)
|
11
|
+
# `uglify -i #{temp_object.path} -u #{ugliness}`
|
12
|
+
# end
|
13
|
+
#
|
14
|
+
# def conditional(temp_object, format, pages)
|
15
|
+
# throw :unable_to_handle unless format == :pdf
|
16
|
+
# # do stuff
|
17
|
+
# end
|
18
|
+
#
|
19
|
+
# private
|
20
|
+
#
|
21
|
+
# def my_helper_method
|
22
|
+
# # do stuff
|
23
|
+
# end
|
24
|
+
# end
|
25
|
+
# MyScraper.processor.register MyProcessor
|
26
|
+
#
|
27
|
+
# Public methods must return an object with which a +TempObject+ may be
|
28
|
+
# initialized (+String+, +File+, +Tempfile+, +Pathname+ or +TempObject+).
|
29
|
+
#
|
30
|
+
# You can raise +Dragonfly::Configurable::NotConfigured+ if a configurable
|
31
|
+
# variable is required but missing. If a variable is invalid, you can raise
|
32
|
+
# +Dragonfly::Configurable::BadConfigAttribute+.
|
33
|
+
#
|
34
|
+
# If a process has dependencies or conditions, then you can test for these
|
35
|
+
# conditions and throw +:unable_to_handle+ to skip processing.
|
36
|
+
#
|
37
|
+
# If multiple processors define a public method by the same name, the methods
|
38
|
+
# will be run in reverse order from the last processor to define the method
|
39
|
+
# until one fails to throw +:unable_to_handle+. If all of them throw it, then
|
40
|
+
# +Dragonfly::FunctionManager::UnableToHandle+ will be raised.
|
41
|
+
#
|
42
|
+
# As such, if you are writing a document to plain-text converter, you can
|
43
|
+
# write a pdftotext processor, a doctopdf processor, etc. which all define
|
44
|
+
# a +to_text+ public method, and use +:unable_to_handle+ to make sure the
|
45
|
+
# correct processor runs.
|
2
46
|
module Processors
|
3
|
-
#
|
47
|
+
# If you are writing a simple scraper and only need one processor, you may
|
48
|
+
# implement a single +transform+ processor method by subclassing this class:
|
4
49
|
#
|
5
50
|
# require 'nokogiri'
|
6
51
|
# class MyProcessor < Unbreakable::Processors::Transform
|
@@ -21,8 +66,10 @@ module Unbreakable
|
|
21
66
|
# * +perform+
|
22
67
|
# * +persist+
|
23
68
|
#
|
24
|
-
#
|
25
|
-
#
|
69
|
+
# +transform+ calls +persist+ with the output of +perform+. This makes it
|
70
|
+
# easy for others to subclass your processor and just change the +persist+
|
71
|
+
# method to change the external database, for example, while still taking
|
72
|
+
# advantage of the hard work done by +perform+.
|
26
73
|
class Transform
|
27
74
|
include Dragonfly::Configurable
|
28
75
|
include Dragonfly::Loggable
|
data/lib/unbreakable/scraper.rb
CHANGED
@@ -2,7 +2,7 @@ require 'forwardable'
|
|
2
2
|
require 'optparse'
|
3
3
|
require 'securerandom'
|
4
4
|
|
5
|
-
require 'active_support/
|
5
|
+
require 'active_support/core_ext/class/attribute_accessors'
|
6
6
|
|
7
7
|
module Unbreakable
|
8
8
|
# You may implement a scraper by subclassing this class:
|
@@ -44,6 +44,9 @@ module Unbreakable
|
|
44
44
|
def_delegators :@app, :add_child_configurable, :configure, :datastore,
|
45
45
|
:fetch, :log, :processor
|
46
46
|
|
47
|
+
cattr_accessor :commands
|
48
|
+
@@commands = []
|
49
|
+
|
47
50
|
# Initializes a Dragonfly app for storage and processing.
|
48
51
|
def initialize
|
49
52
|
@app = Dragonfly[SecureRandom.hex.to_sym]
|
@@ -73,10 +76,12 @@ The most commonly used commands are:
|
|
73
76
|
|
74
77
|
@opts.separator ''
|
75
78
|
@opts.separator 'Specific options:'
|
79
|
+
specific_options
|
76
80
|
extract_configuration @app
|
77
81
|
|
78
82
|
@opts.separator ''
|
79
83
|
@opts.separator 'General options:'
|
84
|
+
general_options
|
80
85
|
@opts.on_tail('-h', '--help', 'Display this screen') do
|
81
86
|
puts @opts
|
82
87
|
exit
|
@@ -85,6 +90,24 @@ The most commonly used commands are:
|
|
85
90
|
@opts
|
86
91
|
end
|
87
92
|
|
93
|
+
# def specific_options
|
94
|
+
# @opts.on('--echo ARG', 'Write a string to standard output') do |x|
|
95
|
+
# puts x
|
96
|
+
# end
|
97
|
+
# end
|
98
|
+
#
|
99
|
+
# @abstract Override to add specific options to the option parser.
|
100
|
+
def specific_options; end
|
101
|
+
|
102
|
+
# def general_options
|
103
|
+
# @opts.on('--echo ARG', 'Write a string to standard output') do |x|
|
104
|
+
# puts x
|
105
|
+
# end
|
106
|
+
# end
|
107
|
+
#
|
108
|
+
# @abstract Override to add general options to the option parser.
|
109
|
+
def general_options; end
|
110
|
+
|
88
111
|
# Runs the command. Most often run from a command-line script as:
|
89
112
|
#
|
90
113
|
# scraper.run(ARGV)
|
@@ -104,7 +127,12 @@ The most commonly used commands are:
|
|
104
127
|
when nil
|
105
128
|
puts opts
|
106
129
|
else
|
107
|
-
|
130
|
+
# Allow subclasses to add more commands.
|
131
|
+
if self.commands.include? command.to_sym
|
132
|
+
send command, args
|
133
|
+
else
|
134
|
+
opts.abort "'#{command}' is not a #{opts.program_name} command. See '#{opts.program_name} --help'."
|
135
|
+
end
|
108
136
|
end
|
109
137
|
end
|
110
138
|
|
@@ -171,17 +199,18 @@ The most commonly used commands are:
|
|
171
199
|
|
172
200
|
# @param [#configuration] object
|
173
201
|
def extract_configuration(object)
|
174
|
-
object.
|
175
|
-
|
176
|
-
|
177
|
-
|
202
|
+
object.config_methods.each do |meth|
|
203
|
+
default = object.configuration[meth] || object.default_configuration[meth]
|
204
|
+
if true === default or false === default
|
205
|
+
@opts.on("--[no-]#{meth}", "default #{default.inspect}") do |x|
|
206
|
+
object.configure{|c| c.send "#{meth}=", x}
|
178
207
|
end
|
179
|
-
elsif String ===
|
180
|
-
@opts.on("--#{
|
181
|
-
object.send "#{
|
208
|
+
elsif String === default or Fixnum === default
|
209
|
+
@opts.on("--#{meth} ARG", "default #{default.inspect}") do |x|
|
210
|
+
object.configure{|c| c.send "#{meth}=", x}
|
182
211
|
end
|
183
|
-
elsif object !=
|
184
|
-
extract_configuration
|
212
|
+
elsif object != default and default.respond_to? :configuration
|
213
|
+
extract_configuration default
|
185
214
|
end
|
186
215
|
end
|
187
216
|
end
|
@@ -190,11 +219,12 @@ The most commonly used commands are:
|
|
190
219
|
def print_configuration(object, indent = 0)
|
191
220
|
indentation = ' ' * indent
|
192
221
|
puts "#{indentation}#{object.class.name}:"
|
193
|
-
object.
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
222
|
+
object.config_methods.each do |meth|
|
223
|
+
default = object.configuration[meth] || object.default_configuration[meth]
|
224
|
+
if true === default or false === default or String === default or Fixnum === default
|
225
|
+
puts " #{indentation}#{meth.to_s.ljust 25 - indent}#{default.inspect}"
|
226
|
+
elsif object != default and default.respond_to? :configuration
|
227
|
+
print_configuration default, indent + 2
|
198
228
|
end
|
199
229
|
end
|
200
230
|
end
|
data/lib/unbreakable/version.rb
CHANGED
data/unbreakable.gemspec
CHANGED
@@ -20,6 +20,6 @@ Gem::Specification.new do |s|
|
|
20
20
|
s.require_paths = ["lib"]
|
21
21
|
|
22
22
|
s.add_runtime_dependency('activesupport', '~> 3.1.0')
|
23
|
-
s.add_runtime_dependency('dragonfly', '~> 0.9.
|
23
|
+
s.add_runtime_dependency('dragonfly', '~> 0.9.8')
|
24
24
|
s.add_development_dependency('rspec', '~> 2.6.0')
|
25
25
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: unbreakable
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.4
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2011-09-
|
12
|
+
date: 2011-09-09 00:00:00.000000000Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: activesupport
|
16
|
-
requirement: &
|
16
|
+
requirement: &70196241006160 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ~>
|
@@ -21,21 +21,21 @@ dependencies:
|
|
21
21
|
version: 3.1.0
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70196241006160
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: dragonfly
|
27
|
-
requirement: &
|
27
|
+
requirement: &70196240974480 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ~>
|
31
31
|
- !ruby/object:Gem::Version
|
32
|
-
version: 0.9.
|
32
|
+
version: 0.9.8
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *70196240974480
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: rspec
|
38
|
-
requirement: &
|
38
|
+
requirement: &70196240973840 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ~>
|
@@ -43,7 +43,7 @@ dependencies:
|
|
43
43
|
version: 2.6.0
|
44
44
|
type: :development
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *70196240973840
|
47
47
|
description: Abstracts and bulletproofs common scraping tasks.
|
48
48
|
email:
|
49
49
|
- info@opennorth.ca
|