saper 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/LICENSE +21 -0
- data/README.md +126 -0
- data/Rakefile +17 -0
- data/bin/saper +60 -0
- data/lib/lib/json_search.rb +54 -0
- data/lib/lib/mechanize.rb +26 -0
- data/lib/lib/nokogiri.rb +12 -0
- data/lib/saper.rb +37 -0
- data/lib/saper/actions/append_with.rb +14 -0
- data/lib/saper/actions/convert_to_html.rb +14 -0
- data/lib/saper/actions/convert_to_json.rb +14 -0
- data/lib/saper/actions/convert_to_markdown.rb +13 -0
- data/lib/saper/actions/convert_to_time.rb +15 -0
- data/lib/saper/actions/convert_to_xml.rb +14 -0
- data/lib/saper/actions/create_atom.rb +18 -0
- data/lib/saper/actions/fetch.rb +17 -0
- data/lib/saper/actions/find.rb +18 -0
- data/lib/saper/actions/find_first.rb +16 -0
- data/lib/saper/actions/get_attribute.rb +15 -0
- data/lib/saper/actions/get_contents.rb +14 -0
- data/lib/saper/actions/get_text.rb +14 -0
- data/lib/saper/actions/prepend_with.rb +14 -0
- data/lib/saper/actions/remove_after.rb +14 -0
- data/lib/saper/actions/remove_before.rb +14 -0
- data/lib/saper/actions/remove_matching.rb +14 -0
- data/lib/saper/actions/remove_tags.rb +15 -0
- data/lib/saper/actions/replace.rb +15 -0
- data/lib/saper/actions/run_recipe.rb +24 -0
- data/lib/saper/actions/run_recipe_and_save.rb +22 -0
- data/lib/saper/actions/save.rb +14 -0
- data/lib/saper/actions/select_matching.rb +14 -0
- data/lib/saper/actions/set_input.rb +19 -0
- data/lib/saper/actions/skip_tags.rb +15 -0
- data/lib/saper/actions/split.rb +24 -0
- data/lib/saper/arguments/attribute.rb +11 -0
- data/lib/saper/arguments/recipe.rb +42 -0
- data/lib/saper/arguments/text.rb +11 -0
- data/lib/saper/arguments/timezone.rb +11 -0
- data/lib/saper/arguments/variable.rb +11 -0
- data/lib/saper/arguments/xpath.rb +11 -0
- data/lib/saper/core/action.rb +209 -0
- data/lib/saper/core/argument.rb +106 -0
- data/lib/saper/core/browser.rb +87 -0
- data/lib/saper/core/dsl.rb +68 -0
- data/lib/saper/core/error.rb +47 -0
- data/lib/saper/core/item.rb +70 -0
- data/lib/saper/core/keychain.rb +18 -0
- data/lib/saper/core/logger.rb +74 -0
- data/lib/saper/core/namespace.rb +139 -0
- data/lib/saper/core/recipe.rb +134 -0
- data/lib/saper/core/runtime.rb +237 -0
- data/lib/saper/core/type.rb +45 -0
- data/lib/saper/items/atom.rb +64 -0
- data/lib/saper/items/document.rb +66 -0
- data/lib/saper/items/html.rb +85 -0
- data/lib/saper/items/json.rb +67 -0
- data/lib/saper/items/markdown.rb +36 -0
- data/lib/saper/items/nothing.rb +15 -0
- data/lib/saper/items/text.rb +54 -0
- data/lib/saper/items/time.rb +42 -0
- data/lib/saper/items/url.rb +34 -0
- data/lib/saper/items/xml.rb +79 -0
- data/lib/saper/version.rb +3 -0
- data/spec/actions/append_with_spec.rb +30 -0
- data/spec/actions/convert_to_html_spec.rb +24 -0
- data/spec/actions/convert_to_json_spec.rb +24 -0
- data/spec/actions/convert_to_markdown_spec.rb +24 -0
- data/spec/actions/convert_to_time_spec.rb +37 -0
- data/spec/actions/convert_to_xml_spec.rb +24 -0
- data/spec/actions/create_atom_spec.rb +31 -0
- data/spec/actions/fetch_spec.rb +7 -0
- data/spec/actions/find_first_spec.rb +7 -0
- data/spec/actions/find_spec.rb +7 -0
- data/spec/actions/get_attribute_spec.rb +7 -0
- data/spec/actions/get_contents.rb +7 -0
- data/spec/actions/get_text.rb +7 -0
- data/spec/actions/prepend_with_spec.rb +30 -0
- data/spec/actions/remove_after.rb +7 -0
- data/spec/actions/remove_before.rb +7 -0
- data/spec/actions/replace_spec.rb +7 -0
- data/spec/actions/run_recipe_and_save_spec.tmp.rb +52 -0
- data/spec/actions/run_recipe_spec.tmp.rb +53 -0
- data/spec/actions/save_spec.rb +7 -0
- data/spec/actions/select_matching_spec.rb +7 -0
- data/spec/actions/set_input_spec.rb +7 -0
- data/spec/actions/skip_tags_spec.rb +7 -0
- data/spec/actions/split_spec.rb +7 -0
- data/spec/core/action_spec.rb +151 -0
- data/spec/core/argument_spec.rb +79 -0
- data/spec/core/browser_spec.rb +7 -0
- data/spec/core/dsl_spec.rb +7 -0
- data/spec/core/item_spec.rb +7 -0
- data/spec/core/keychain_spec.rb +7 -0
- data/spec/core/logger_spec.rb +7 -0
- data/spec/core/namespace_spec.rb +18 -0
- data/spec/core/recipe_spec.rb +81 -0
- data/spec/core/runtime_spec.rb +165 -0
- data/spec/core/type_spec.rb +7 -0
- data/spec/items/atom_spec.rb +7 -0
- data/spec/items/document_spec.rb +7 -0
- data/spec/items/html_spec.rb +7 -0
- data/spec/items/json_spec.rb +7 -0
- data/spec/items/markdown_spec.rb +7 -0
- data/spec/items/nothing_spec.rb +7 -0
- data/spec/items/text_spec.rb +17 -0
- data/spec/items/time_spec.rb +7 -0
- data/spec/items/url_spec.rb +7 -0
- data/spec/items/xml_spec.rb +17 -0
- data/spec/spec_helper.rb +22 -0
- metadata +355 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: be3fc3ae66117c4a6e4c28cb4a85834f9de5bea4
|
4
|
+
data.tar.gz: 41f8df60e4ee69d04868beef49a234218ec7e3d2
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 6533dd86c1d9b23cf20094666131d78197a89782a5d9ff6b4237127a14ab8f6be45727461592b2cad3a1d28074b8c4abce17f2da04c95498e5213d216f00f89b
|
7
|
+
data.tar.gz: 8b72ff711a4d6244e0ee8655fa40cd280a51961837a73604d090c014ea3f446a6d0ecfb434ed20df3d465038e52bea9ff2151e55aacb01cdf0c101fe267847e0
|
data/LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2013 Merimond Corporation
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,126 @@
|
|
1
|
+
# Saper
|
2
|
+
|
3
|
+
Saper is a web automation library written in Ruby. It allows you to crawl websites and
|
4
|
+
extract data in an efficient, controllable and fault-tolerant manner.
|
5
|
+
|
6
|
+
Common use scenarios:
|
7
|
+
|
8
|
+
- scrape a website and save data in a structured format
|
9
|
+
- create an RSS feed for a website or a web application
|
10
|
+
- create an API for a website that doesn't have one
|
11
|
+
|
12
|
+
## Installing
|
13
|
+
|
14
|
+
Make sure you have Ruby and RubyGems, then run:
|
15
|
+
|
16
|
+
gem install saper
|
17
|
+
|
18
|
+
## Recipes
|
19
|
+
|
20
|
+
Recipe is the core element of Saper. It is a chain of actions run consecutively, so
|
21
|
+
that each action processes output of the preceding action. You may create recipes
|
22
|
+
by instantiating Ruby classes; however, using the embedded DSL is the preferred method.
|
23
|
+
|
24
|
+
Here's a recipe that produces a list of recent Bloomberg articles in the 'worldwide
|
25
|
+
news' section (note that this data is unavailable via RSS):
|
26
|
+
|
27
|
+
recipe :bloomberg do
|
28
|
+
set_input "http://www.bloomberg.com/news/worldwide"
|
29
|
+
fetch
|
30
|
+
convert_to_html
|
31
|
+
find ".news_item a"
|
32
|
+
get_attribute "href"
|
33
|
+
prepend_with "http://bloomberg.com"
|
34
|
+
end
|
35
|
+
|
36
|
+
Given that the file is saved as _myrecipe.txt_, you can now use the command line:
|
37
|
+
|
38
|
+
$ saper myrecipe.txt --recipe bloomberg
|
39
|
+
|
40
|
+
Alternatively, you can use Ruby:
|
41
|
+
|
42
|
+
#!/usr/bin/env ruby
|
43
|
+
Saper.run("myrecipe.txt", :bloomberg).results
|
44
|
+
|
45
|
+
## Data flow
|
46
|
+
|
47
|
+
Data flows from one action to another so that output of each action is used as
|
48
|
+
input for the next one. Using the example above:
|
49
|
+
|
50
|
+
set_input "http://www.bloomberg.com/news/worldwide"
|
51
|
+
> String
|
52
|
+
fetch
|
53
|
+
> Document
|
54
|
+
convert_to_html
|
55
|
+
> HTML
|
56
|
+
find ".news_item a"
|
57
|
+
> [HTML, HTML, ... ]
|
58
|
+
get_attribute "href"
|
59
|
+
> [String, String, ... ]
|
60
|
+
prepend_with "http://bloomberg.com"
|
61
|
+
> [String, String, ... ]
|
62
|
+
|
63
|
+
Whenever an action returns multiple results (e.g. *find* returns multiple HTML nodes)
|
64
|
+
the following action will run several times as well. For instance, if *find* returns
|
65
|
+
10 elements, then *get_attribute* will run 10 times (and produce 10 elements).
|
66
|
+
|
67
|
+
If any action fails (e.g. a link has no *href* attribute), Saper will silently skip it
|
68
|
+
and proceed with the rest. All errors are logged and available for subsequent inspection,
|
69
|
+
but no error will ever stop the execution of a recipe -- this is the core idea behind Saper.
|
70
|
+
|
71
|
+
## Available actions
|
72
|
+
|
73
|
+
Below is a list of all available actions:
|
74
|
+
|
75
|
+
### Downloading information
|
76
|
+
|
77
|
+
- **fetch** - download data from URL.
|
78
|
+
- **convert_to_json** - parse downloaded data as JSON.
|
79
|
+
- **convert_to_html** - parse downloaded data as HTML.
|
80
|
+
- **convert_to_xml** - parse downloaded data as XML.
|
81
|
+
|
82
|
+
### String manipulations
|
83
|
+
|
84
|
+
- **convert_to_time** (format, timezone) - convert string to _time_.
|
85
|
+
- **append_with** (string) - concatenate input with _string_.
|
86
|
+
- **prepend_with** (string) - concatenate _string_ with input.
|
87
|
+
- **remove_after** (separator) - search for _separator_ and remove part of input that follows the first occurrence.
|
88
|
+
- **remove_before** (separator) - search for _separator_ and remove part of input that precedes the first occurrence.
|
89
|
+
- **remove_matching** (regexp) - stop recipe execution for strings that match the specified pattern.
|
90
|
+
- **replace** (string, string) - substitute one block of text with another.
|
91
|
+
- **select_matching** (regexp) - continue recipe execution only for those strings that match the specified pattern.
|
92
|
+
- **split** (separator) - split string into multiple parts using specified _separator_.
|
93
|
+
|
94
|
+
### HTML / XML manipulations
|
95
|
+
|
96
|
+
- **convert_to_markdown** - return tag contents converted to markdown.
|
97
|
+
- **find** (xpath) - return nodes matching specified XPath or CSS selector.
|
98
|
+
- **find_first** (xpath) - return the first node matching specified XPath or CSS selector.
|
99
|
+
- **get_attribute** (name) - returns value of specified attribute.
|
100
|
+
- **get_contents** - returns tag contents including any child tags (similar to inner_html).
|
101
|
+
- **get_text** - returns text contents of a tag, skipping any tags (similar to inner_text).
|
102
|
+
- **remove_tags** (name) - removes child tags including content.
|
103
|
+
- **skip_tags** (name) - removes child tags preserving their content.
|
104
|
+
|
105
|
+
### Special-purpose actions
|
106
|
+
|
107
|
+
- **set_input** (string) - sets input for the following action.
|
108
|
+
- **create_atom** - create an Atom from saved variables.
|
109
|
+
- **run_recipe_and_save** (variable, recipe) - run another recipe and save its result as a variable.
|
110
|
+
- **run_recipe** (recipe) - run another recipe and use its output as input for the next action.
|
111
|
+
- **save** (variable) - save input as a variable.
|
112
|
+
|
113
|
+
# Contributing
|
114
|
+
|
115
|
+
- Find something you would like to work on.
|
116
|
+
- Fork the project and do your work in a topic branch.
|
117
|
+
- Make sure your changes will work on both Ruby 1.8.7 and Ruby 1.9.
|
118
|
+
- Add tests in spec/ folder for the behavior you want to test.
|
119
|
+
- Run all the tests using _rake spec_.
|
120
|
+
- Commit your changes and send a pull request.
|
121
|
+
|
122
|
+
# License
|
123
|
+
|
124
|
+
Copyright (c) 2013 Merimond Corporation. MIT license, see [LICENSE] for details.
|
125
|
+
|
126
|
+
[LICENSE]: http://github.com/merimond/saper/blob/master/LICENSE
|
data/Rakefile
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'bundler'
|
3
|
+
require 'rspec/core/rake_task'
|
4
|
+
require 'yard'
|
5
|
+
|
6
|
+
Bundler.require
|
7
|
+
Bundler::GemHelper.install_tasks
|
8
|
+
|
9
|
+
desc "Run specs"
|
10
|
+
RSpec::Core::RakeTask.new do |t|
|
11
|
+
# nothing
|
12
|
+
end
|
13
|
+
|
14
|
+
desc "Generate docs"
|
15
|
+
YARD::Rake::YardocTask.new do |t|
|
16
|
+
t.files = ['lib/saper/**/*.rb']
|
17
|
+
end
|
data/bin/saper
ADDED
@@ -0,0 +1,60 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require "saper"
|
4
|
+
require 'trollop'
|
5
|
+
|
6
|
+
opts = Trollop::options do
|
7
|
+
banner <<-EOS
|
8
|
+
Saper web automation library (version #{Saper::VERSION})
|
9
|
+
Usage: saper <filename> [options]
|
10
|
+
|
11
|
+
Options are ...
|
12
|
+
EOS
|
13
|
+
opt :input, "Recipe input", :type => :string
|
14
|
+
opt :recipe, "Recipe name", :type => :string
|
15
|
+
end
|
16
|
+
|
17
|
+
unless ARGV.size == 1
|
18
|
+
Trollop::die "Please specify filename"
|
19
|
+
end
|
20
|
+
|
21
|
+
path = File.expand_path(ARGV.first)
|
22
|
+
|
23
|
+
unless File.exists?(path) && File.file?(path)
|
24
|
+
Trollop::die("File not found")
|
25
|
+
end
|
26
|
+
|
27
|
+
begin
|
28
|
+
data = File.read(path)
|
29
|
+
rescue
|
30
|
+
Trollop::die("File cannot be read")
|
31
|
+
end
|
32
|
+
|
33
|
+
begin
|
34
|
+
space = Saper::Namespace.parse(data)
|
35
|
+
rescue NoMethodError => e
|
36
|
+
Trollop::die("Invalid command: %s" % e.message.match(/`(\w+)'/)[1])
|
37
|
+
end
|
38
|
+
|
39
|
+
if opts[:recipe].nil?
|
40
|
+
if space.run_by_default.nil?
|
41
|
+
Trollop::die("Recipe name undefined")
|
42
|
+
end
|
43
|
+
opts[:recipe] = space.run_by_default
|
44
|
+
end
|
45
|
+
|
46
|
+
begin
|
47
|
+
recipe = space[opts[:recipe]]
|
48
|
+
rescue Saper::RecipeNotFound => e
|
49
|
+
Trollop::die("`%s` recipe not found" % opts[:recipe]) # report the recipe name that was looked up, not the recipe input
|
50
|
+
rescue Saper::ActionNotFound => e
|
51
|
+
Trollop::die("Invalid action: %s" % e.message)
|
52
|
+
rescue Saper::InvalidArgument => e
|
53
|
+
Trollop::die("Invalid action argument") # abort with a usage error when an action argument is rejected
|
54
|
+
end
|
55
|
+
|
56
|
+
if opts[:input].nil? && recipe.input_required?
|
57
|
+
Trollop::die("No input specified, action requires %s" % recipe.input_required.inspect)
|
58
|
+
end
|
59
|
+
|
60
|
+
puts recipe.run(opts[:input]).results
|
@@ -0,0 +1,54 @@
|
|
1
|
+
# XPath-style library for JSON
|
2
|
+
|
3
|
+
module JSONSearch
|
4
|
+
|
5
|
+
def self.find(object, selector)
|
6
|
+
if object.is_a?(Array)
|
7
|
+
return object.map { |item|
|
8
|
+
find(item, selector)
|
9
|
+
}.flatten.compact
|
10
|
+
end
|
11
|
+
case selector
|
12
|
+
when /\A\/([^\/]+)/
|
13
|
+
name = $1
|
14
|
+
sub = selector.sub($&, "")
|
15
|
+
children(object, name).map { |child|
|
16
|
+
find(child, sub)
|
17
|
+
}.flatten.compact
|
18
|
+
when /\A\/\/([^\/]+)/
|
19
|
+
name = $1
|
20
|
+
sub = selector.sub($&, "")
|
21
|
+
descendants(object, name).map { |descendant|
|
22
|
+
find(descendant, sub)
|
23
|
+
}.flatten.compact
|
24
|
+
when /\A\z/
|
25
|
+
object
|
26
|
+
else
|
27
|
+
raise "Unsupported selector"
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
def self.children(object, name)
|
32
|
+
case object
|
33
|
+
when Array
|
34
|
+
object.map { |item|
|
35
|
+
children(item, name)
|
36
|
+
}.flatten
|
37
|
+
when Hash
|
38
|
+
if name == "*"
|
39
|
+
object.values
|
40
|
+
else
|
41
|
+
[object[name]]
|
42
|
+
end
|
43
|
+
else
|
44
|
+
[]
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
def self.descendants(object, name)
|
49
|
+
children(object, "*").map { |child|
|
50
|
+
descendants(child, name)
|
51
|
+
}.push(*children(object, name))
|
52
|
+
end
|
53
|
+
|
54
|
+
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
# Monkey patching Mechanize
|
2
|
+
# - allow custom pre_connect_hooks and post_connect_hooks
|
3
|
+
|
4
|
+
class Mechanize
|
5
|
+
|
6
|
+
def pre_connect_hook(&block)
|
7
|
+
@agent.pre_connect_hook(&block)
|
8
|
+
end
|
9
|
+
|
10
|
+
def post_connect_hook(&block)
|
11
|
+
@agent.post_connect_hook(&block)
|
12
|
+
end
|
13
|
+
|
14
|
+
end
|
15
|
+
|
16
|
+
class Mechanize::HTTP::Agent
|
17
|
+
|
18
|
+
def pre_connect_hook(&block)
|
19
|
+
@pre_connect_hooks << block
|
20
|
+
end
|
21
|
+
|
22
|
+
def post_connect_hook(&block)
|
23
|
+
@post_connect_hooks << block
|
24
|
+
end
|
25
|
+
|
26
|
+
end
|
data/lib/lib/nokogiri.rb
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
# Monkey patching Nokogiri:
|
2
|
+
# - correctly handle — character
|
3
|
+
|
4
|
+
module Nokogiri
|
5
|
+
module HTML
|
6
|
+
|
7
|
+
def self.parse(thing, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block)
|
8
|
+
Document.parse(thing.gsub("—", "—"), url, encoding, options, &block)
|
9
|
+
end
|
10
|
+
|
11
|
+
end
|
12
|
+
end
|
data/lib/saper.rb
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
module Saper
|
2
|
+
|
3
|
+
def self.require_multiple(dir)
|
4
|
+
Dir[File.dirname(__FILE__) + "/" + dir].each { |file| require file }
|
5
|
+
end
|
6
|
+
|
7
|
+
require 'reverse_markdown'
|
8
|
+
require 'json'
|
9
|
+
require 'vremya'
|
10
|
+
require 'nokogiri'
|
11
|
+
require 'securerandom'
|
12
|
+
require 'mechanize'
|
13
|
+
|
14
|
+
require_relative "lib/json_search"
|
15
|
+
require_relative "lib/mechanize"
|
16
|
+
require_relative "lib/nokogiri"
|
17
|
+
|
18
|
+
require_relative "saper/version"
|
19
|
+
require_relative "saper/version"
|
20
|
+
require_relative "saper/core/item"
|
21
|
+
require_multiple "saper/items/*.rb"
|
22
|
+
require_relative "saper/core/argument"
|
23
|
+
require_multiple "saper/arguments/*.rb"
|
24
|
+
require_relative "saper/core/action"
|
25
|
+
require_multiple "saper/actions/*.rb"
|
26
|
+
require_relative "saper/core/error"
|
27
|
+
require_relative "saper/core/namespace"
|
28
|
+
require_relative "saper/core/type"
|
29
|
+
require_relative "saper/core/recipe"
|
30
|
+
require_relative "saper/core/error"
|
31
|
+
require_relative "saper/core/keychain"
|
32
|
+
require_relative "saper/core/dsl"
|
33
|
+
require_relative "saper/core/browser"
|
34
|
+
require_relative "saper/core/logger"
|
35
|
+
require_relative "saper/core/runtime"
|
36
|
+
|
37
|
+
end
|