saper 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (111) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE +21 -0
  3. data/README.md +126 -0
  4. data/Rakefile +17 -0
  5. data/bin/saper +60 -0
  6. data/lib/lib/json_search.rb +54 -0
  7. data/lib/lib/mechanize.rb +26 -0
  8. data/lib/lib/nokogiri.rb +12 -0
  9. data/lib/saper.rb +37 -0
  10. data/lib/saper/actions/append_with.rb +14 -0
  11. data/lib/saper/actions/convert_to_html.rb +14 -0
  12. data/lib/saper/actions/convert_to_json.rb +14 -0
  13. data/lib/saper/actions/convert_to_markdown.rb +13 -0
  14. data/lib/saper/actions/convert_to_time.rb +15 -0
  15. data/lib/saper/actions/convert_to_xml.rb +14 -0
  16. data/lib/saper/actions/create_atom.rb +18 -0
  17. data/lib/saper/actions/fetch.rb +17 -0
  18. data/lib/saper/actions/find.rb +18 -0
  19. data/lib/saper/actions/find_first.rb +16 -0
  20. data/lib/saper/actions/get_attribute.rb +15 -0
  21. data/lib/saper/actions/get_contents.rb +14 -0
  22. data/lib/saper/actions/get_text.rb +14 -0
  23. data/lib/saper/actions/prepend_with.rb +14 -0
  24. data/lib/saper/actions/remove_after.rb +14 -0
  25. data/lib/saper/actions/remove_before.rb +14 -0
  26. data/lib/saper/actions/remove_matching.rb +14 -0
  27. data/lib/saper/actions/remove_tags.rb +15 -0
  28. data/lib/saper/actions/replace.rb +15 -0
  29. data/lib/saper/actions/run_recipe.rb +24 -0
  30. data/lib/saper/actions/run_recipe_and_save.rb +22 -0
  31. data/lib/saper/actions/save.rb +14 -0
  32. data/lib/saper/actions/select_matching.rb +14 -0
  33. data/lib/saper/actions/set_input.rb +19 -0
  34. data/lib/saper/actions/skip_tags.rb +15 -0
  35. data/lib/saper/actions/split.rb +24 -0
  36. data/lib/saper/arguments/attribute.rb +11 -0
  37. data/lib/saper/arguments/recipe.rb +42 -0
  38. data/lib/saper/arguments/text.rb +11 -0
  39. data/lib/saper/arguments/timezone.rb +11 -0
  40. data/lib/saper/arguments/variable.rb +11 -0
  41. data/lib/saper/arguments/xpath.rb +11 -0
  42. data/lib/saper/core/action.rb +209 -0
  43. data/lib/saper/core/argument.rb +106 -0
  44. data/lib/saper/core/browser.rb +87 -0
  45. data/lib/saper/core/dsl.rb +68 -0
  46. data/lib/saper/core/error.rb +47 -0
  47. data/lib/saper/core/item.rb +70 -0
  48. data/lib/saper/core/keychain.rb +18 -0
  49. data/lib/saper/core/logger.rb +74 -0
  50. data/lib/saper/core/namespace.rb +139 -0
  51. data/lib/saper/core/recipe.rb +134 -0
  52. data/lib/saper/core/runtime.rb +237 -0
  53. data/lib/saper/core/type.rb +45 -0
  54. data/lib/saper/items/atom.rb +64 -0
  55. data/lib/saper/items/document.rb +66 -0
  56. data/lib/saper/items/html.rb +85 -0
  57. data/lib/saper/items/json.rb +67 -0
  58. data/lib/saper/items/markdown.rb +36 -0
  59. data/lib/saper/items/nothing.rb +15 -0
  60. data/lib/saper/items/text.rb +54 -0
  61. data/lib/saper/items/time.rb +42 -0
  62. data/lib/saper/items/url.rb +34 -0
  63. data/lib/saper/items/xml.rb +79 -0
  64. data/lib/saper/version.rb +3 -0
  65. data/spec/actions/append_with_spec.rb +30 -0
  66. data/spec/actions/convert_to_html_spec.rb +24 -0
  67. data/spec/actions/convert_to_json_spec.rb +24 -0
  68. data/spec/actions/convert_to_markdown_spec.rb +24 -0
  69. data/spec/actions/convert_to_time_spec.rb +37 -0
  70. data/spec/actions/convert_to_xml_spec.rb +24 -0
  71. data/spec/actions/create_atom_spec.rb +31 -0
  72. data/spec/actions/fetch_spec.rb +7 -0
  73. data/spec/actions/find_first_spec.rb +7 -0
  74. data/spec/actions/find_spec.rb +7 -0
  75. data/spec/actions/get_attribute_spec.rb +7 -0
  76. data/spec/actions/get_contents.rb +7 -0
  77. data/spec/actions/get_text.rb +7 -0
  78. data/spec/actions/prepend_with_spec.rb +30 -0
  79. data/spec/actions/remove_after.rb +7 -0
  80. data/spec/actions/remove_before.rb +7 -0
  81. data/spec/actions/replace_spec.rb +7 -0
  82. data/spec/actions/run_recipe_and_save_spec.tmp.rb +52 -0
  83. data/spec/actions/run_recipe_spec.tmp.rb +53 -0
  84. data/spec/actions/save_spec.rb +7 -0
  85. data/spec/actions/select_matching_spec.rb +7 -0
  86. data/spec/actions/set_input_spec.rb +7 -0
  87. data/spec/actions/skip_tags_spec.rb +7 -0
  88. data/spec/actions/split_spec.rb +7 -0
  89. data/spec/core/action_spec.rb +151 -0
  90. data/spec/core/argument_spec.rb +79 -0
  91. data/spec/core/browser_spec.rb +7 -0
  92. data/spec/core/dsl_spec.rb +7 -0
  93. data/spec/core/item_spec.rb +7 -0
  94. data/spec/core/keychain_spec.rb +7 -0
  95. data/spec/core/logger_spec.rb +7 -0
  96. data/spec/core/namespace_spec.rb +18 -0
  97. data/spec/core/recipe_spec.rb +81 -0
  98. data/spec/core/runtime_spec.rb +165 -0
  99. data/spec/core/type_spec.rb +7 -0
  100. data/spec/items/atom_spec.rb +7 -0
  101. data/spec/items/document_spec.rb +7 -0
  102. data/spec/items/html_spec.rb +7 -0
  103. data/spec/items/json_spec.rb +7 -0
  104. data/spec/items/markdown_spec.rb +7 -0
  105. data/spec/items/nothing_spec.rb +7 -0
  106. data/spec/items/text_spec.rb +17 -0
  107. data/spec/items/time_spec.rb +7 -0
  108. data/spec/items/url_spec.rb +7 -0
  109. data/spec/items/xml_spec.rb +17 -0
  110. data/spec/spec_helper.rb +22 -0
  111. metadata +355 -0
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: be3fc3ae66117c4a6e4c28cb4a85834f9de5bea4
4
+ data.tar.gz: 41f8df60e4ee69d04868beef49a234218ec7e3d2
5
+ SHA512:
6
+ metadata.gz: 6533dd86c1d9b23cf20094666131d78197a89782a5d9ff6b4237127a14ab8f6be45727461592b2cad3a1d28074b8c4abce17f2da04c95498e5213d216f00f89b
7
+ data.tar.gz: 8b72ff711a4d6244e0ee8655fa40cd280a51961837a73604d090c014ea3f446a6d0ecfb434ed20df3d465038e52bea9ff2151e55aacb01cdf0c101fe267847e0
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2013 Merimond Corporation
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
@@ -0,0 +1,126 @@
1
+ # Saper
2
+
3
+ Saper is a web automation library written in Ruby. It allows you to crawl websites and
4
+ extract data in an efficient, controllable and fault-tolerant manner.
5
+
6
+ Common use scenarios:
7
+
8
+ - scrape a website and save data in a structured format
9
+ - create an RSS feed for a website or a web application
10
+ - create an API for a website that doesn't have one
11
+
12
+ ## Installing
13
+
14
+ Make sure you have Ruby and RubyGems, then run:
15
+
16
+ gem install saper
17
+
18
+ ## Recipes
19
+
20
+ Recipe is the core element of Saper. It is a chain of actions run consecutively, so
21
+ that each action processes output of the preceding action. You may create recipes
22
+ by instantiating Ruby classes, however using embedded DSL is the preferred method.
23
+
24
+ Here's a recipe that produces a list of recent Bloomberg articles in the 'worldwide
25
+ news' section (note that this data is unavailable via RSS):
26
+
27
+ recipe :bloomberg do
28
+ set_input "http://www.bloomberg.com/news/worldwide"
29
+ fetch
30
+ convert_to_html
31
+ find ".news_item a"
32
+ get_attribute "href"
33
+ prepend_with "http://bloomberg.com"
34
+ end
35
+
36
+ Given that the file is saved as _myrecipe.txt_, you can now use the command line:
37
+
38
+ $ saper myrecipe.txt --recipe bloomberg
39
+
40
+ Alternatively, you can use Ruby:
41
+
42
+ #!/usr/bin/env ruby
43
+ Saper.run("myrecipe.txt", :bloomberg).results
44
+
45
+ ## Data flow
46
+
47
+ Data flows from one action to another so that output of each action is used as
48
+ input for the next one. Using the example above:
49
+
50
+ set_input "http://www.bloomberg.com/news/worldwide"
51
+ > String
52
+ fetch
53
+ > Document
54
+ convert_to_html
55
+ > HTML
56
+ find ".news_item a"
57
+ > [HTML, HTML, ... ]
58
+ get_attribute "href"
59
+ > [String, String, ... ]
60
+ prepend_with "http://bloomberg.com"
61
+ > [String, String, ... ]
62
+
63
+ Whenever an action returns multiple results (e.g. *find* returns multiple HTML nodes)
64
+ the following action will run several times as well. For instance, if *find* returns
65
+ 10 elements, then *get_attribute* will run 10 times (and produce 10 elements).
66
+
67
+ If any action fails (e.g. a link has no *href* attribute), Saper will silently skip it
68
+ and proceed with the rest. All errors are logged and available for subsequent inspection,
69
+ but no error will ever stop the execution of a recipe -- this is the core idea behind Saper.
70
+
71
+ ## Available actions
72
+
73
+ Below is a list of all available actions:
74
+
75
+ ### Downloading information
76
+
77
+ - **fetch** - download data from URL.
78
+ - **convert_to_json** - parse downloaded data as JSON.
79
+ - **convert_to_html** - parse downloaded data as HTML.
80
+ - **convert_to_xml** - parse downloaded data as XML.
81
+
82
+ ### String manipulations
83
+
84
+ - **convert_to_time** (format, timezone) - convert string to _time_.
85
+ - **append_with** (string) - concatenate input with _string_.
86
+ - **prepend_with** (string) - concatenate _string_ with input.
87
+ - **remove_after** (separator) - search for _separator_ and remove part of input that follows the first occurrence.
88
+ - **remove_before** (separator) - search for _separator_ and remove part of input that precedes the first occurrence.
89
+ - **remove_matching** (regexp) - stop recipe execution for strings that match the specified pattern.
90
+ - **replace** (string, string) - substitute one block of text with another.
91
+ - **select_matching** (regexp) - continue recipe execution only for those strings that match the specified pattern.
92
+ - **split** (separator) - split string into multiple parts using specified _separator_.
93
+
94
+ ### HTML / XML manipulations
95
+
96
+ - **convert_to_markdown** - return tag contents converted to markdown.
97
+ - **find** (xpath) - return nodes matching specified XPath or CSS selector.
98
+ - **find_first** (xpath) - return the first node matching specified XPath or CSS selector.
99
+ - **get_attribute** (name) - returns value of specified attribute.
100
+ - **get_contents** - returns tag contents including any child tags (similar to inner_html).
101
+ - **get_text** - returns text contents of a tag, skipping any tags (similar to inner_text).
102
+ - **remove_tags** (name) - removes child tags including content.
103
+ - **skip_tags** (name) - removes child tags preserving their content.
104
+
105
+ ### Special-purpose actions
106
+
107
+ - **set_input** (string) - sets input for the following action.
108
+ - **create_atom** - create an Atom from saved variables.
109
+ - **run_recipe_and_save** (variable, recipe) - run another recipe and save its result as a variable.
110
+ - **run_recipe** (recipe) - run another recipe and use its output as input for the next action.
111
+ - **save** (variable) - save input as a variable.
112
+
113
+ # Contributing
114
+
115
+ - Find something you would like to work on.
116
+ - Fork the project and do your work in a topic branch.
117
+ - Make sure your changes will work on both Ruby 1.8.7 and Ruby 1.9.
118
+ - Add tests in spec/ folder for the behavior you want to test.
119
+ - Run all the tests using _rake spec_.
120
+ - Commit your changes and send a pull request.
121
+
122
+ # License
123
+
124
+ Copyright (c) 2013 Merimond Corporation. MIT license, see [LICENSE] for details.
125
+
126
+ [LICENSE]: http://github.com/merimond/saper/blob/master/LICENSE
@@ -0,0 +1,17 @@
1
+ require 'rubygems'
2
+ require 'bundler'
3
+ require 'rspec/core/rake_task'
4
+ require 'yard'
5
+
6
# Load the bundle and expose Bundler's gem packaging tasks.
Bundler.require
Bundler::GemHelper.install_tasks

# Spec task: RSpec's defaults are used, so the configuration block is empty.
desc "Run specs"
RSpec::Core::RakeTask.new { |_task| }

# Documentation task: YARD documents the files under lib/saper.
desc "Generate docs"
YARD::Rake::YardocTask.new { |task| task.files = ['lib/saper/**/*.rb'] }
@@ -0,0 +1,60 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "saper"
4
+ require 'trollop'
5
+
6
# Command-line entry point: parses options, loads a recipe file, picks a
# recipe and prints the results of running it.

opts = Trollop::options do
  banner <<-EOS
Saper web automation library (version #{Saper::VERSION})
Usage: saper <filename> [options]

Options are ...
  EOS
  opt :input, "Recipe input", :type => :string
  opt :recipe, "Recipe name", :type => :string
end

# Exactly one positional argument (the recipe file) must remain after
# option parsing.
Trollop::die "Please specify filename" unless ARGV.size == 1

path = File.expand_path(ARGV.first)

# File.file? already implies existence; File.exists? was removed in Ruby 3.2.
Trollop::die("File not found") unless File.file?(path)

begin
  data = File.read(path)
rescue
  Trollop::die("File cannot be read")
end

begin
  space = Saper::Namespace.parse(data)
rescue NoMethodError => e
  # NameError#name carries the offending method name, so there is no need
  # to scrape it out of the (Ruby-version-dependent) error message.
  Trollop::die("Invalid command: %s" % e.name)
end

# Fall back to the namespace's default recipe when none was requested.
if opts[:recipe].nil?
  Trollop::die("Recipe name undefined") if space.run_by_default.nil?
  opts[:recipe] = space.run_by_default
end

begin
  recipe = space[opts[:recipe]]
rescue Saper::RecipeNotFound
  # Report the recipe name that was looked up (previously the input value
  # was interpolated here by mistake).
  Trollop::die("`%s` recipe not found" % opts[:recipe])
rescue Saper::ActionNotFound => e
  Trollop::die("Invalid action: %s" % e.message)
rescue Saper::InvalidArgument
  Trollop::die("Invalid action argument")
end

if opts[:input].nil? && recipe.input_required?
  Trollop::die("No input specified, action requires %s" % recipe.input_required.inspect)
end

puts recipe.run(opts[:input]).results
@@ -0,0 +1,54 @@
1
+ # XPath-style library for JSON
2
+
3
# XPath-style lookups over parsed JSON data (nested Hashes and Arrays).
#
# Supported selector steps:
#   /name   - child with the given key
#   /*      - all children
#   //name  - descendants with the given key, at any depth
module JSONSearch

  # Returns the values matching +selector+ inside +object+.
  #
  # When +object+ is an Array, the selector is applied to every element and
  # the results are merged. Results are always flattened and stripped of
  # nils. An empty selector returns +object+ itself.
  #
  # Raises RuntimeError ("Unsupported selector") for any other selector.
  def self.find(object, selector)
    if object.is_a?(Array)
      return object.map { |item| find(item, selector) }.flatten.compact
    end

    case selector
    when %r{\A/([^/]+)}
      # Both patterns are anchored at the start, so post_match is the
      # selector with the leading step removed.
      step = Regexp.last_match
      children(object, step[1]).map { |child|
        find(child, step.post_match)
      }.flatten.compact
    when %r{\A//([^/]+)}
      step = Regexp.last_match
      descendants(object, step[1]).map { |descendant|
        find(descendant, step.post_match)
      }.flatten.compact
    when /\A\z/
      object
    else
      raise "Unsupported selector"
    end
  end

  # Returns the direct children of +object+ stored under key +name+, or all
  # values when +name+ is "*". Arrays are searched element by element.
  # A key that is absent yields no children (rather than a nil placeholder).
  def self.children(object, name)
    case object
    when Array
      object.map { |item| children(item, name) }.flatten
    when Hash
      if name == "*"
        object.values
      elsif object.key?(name)
        [object[name]]
      else
        []
      end
    else
      []
    end
  end

  # Returns a flat list of every value stored under key +name+ at any depth
  # below +object+; deeper matches come before matches on +object+ itself.
  def self.descendants(object, name)
    deeper = children(object, "*").flat_map { |child| descendants(child, name) }
    deeper + children(object, name)
  end

end
@@ -0,0 +1,26 @@
1
+ # Monkey patching Mechanize
2
+ # - allow custom pre_connect_hooks and post_connect_hooks
3
+
4
class Mechanize

  # Hands the given block to the underlying agent as a pre-connect hook.
  def pre_connect_hook(&hook)
    @agent.pre_connect_hook(&hook)
  end

  # Hands the given block to the underlying agent as a post-connect hook.
  def post_connect_hook(&hook)
    @agent.post_connect_hook(&hook)
  end

end
15
+
16
class Mechanize::HTTP::Agent

  # Appends the given block to the agent's list of pre-connect hooks.
  def pre_connect_hook(&hook)
    @pre_connect_hooks.push(hook)
  end

  # Appends the given block to the agent's list of post-connect hooks.
  def post_connect_hook(&hook)
    @post_connect_hooks.push(hook)
  end

end
@@ -0,0 +1,12 @@
1
+ # Monkey patching Nokogiri:
2
+ # - correctly handle &#151; character
3
+
4
module Nokogiri
  module HTML

    # Replaces Nokogiri::HTML.parse so that the "&#151;" character reference
    # is turned into a literal em dash before the document is parsed.
    #
    # The substitution is applied only when +thing+ responds to +gsub+
    # (i.e. string-like input); any other input, such as an IO object, is
    # passed through to Document.parse untouched instead of raising
    # NoMethodError.
    def self.parse(thing, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block)
      thing = thing.gsub("&#151;", "—") if thing.respond_to?(:gsub)
      Document.parse(thing, url, encoding, options, &block)
    end

  end
end
@@ -0,0 +1,37 @@
1
# Top-level namespace; loading this file pulls in every dependency and
# every Saper source file.
module Saper

  # Requires every file matching +dir+, a glob pattern relative to the
  # directory containing this file.
  #
  # Matches are sorted so the load order is the same on every platform:
  # before Ruby 3.0, Dir[] returned entries in filesystem order.
  def self.require_multiple(dir)
    Dir[File.join(File.dirname(__FILE__), dir)].sort.each { |file| require file }
  end

  # Third-party and standard-library dependencies.
  require 'reverse_markdown'
  require 'json'
  require 'vremya'
  require 'nokogiri'
  require 'securerandom'
  require 'mechanize'

  # Local helpers and patches for the libraries above.
  require_relative "lib/json_search"
  require_relative "lib/mechanize"
  require_relative "lib/nokogiri"

  # Saper itself. Each core base file is loaded before the directory of
  # files that follows it. Repeated require_relative lines for "version"
  # and "core/error" were dropped; they were no-ops.
  require_relative "saper/version"
  require_relative "saper/core/item"
  require_multiple "saper/items/*.rb"
  require_relative "saper/core/argument"
  require_multiple "saper/arguments/*.rb"
  require_relative "saper/core/action"
  require_multiple "saper/actions/*.rb"
  require_relative "saper/core/error"
  require_relative "saper/core/namespace"
  require_relative "saper/core/type"
  require_relative "saper/core/recipe"
  require_relative "saper/core/keychain"
  require_relative "saper/core/dsl"
  require_relative "saper/core/browser"
  require_relative "saper/core/logger"
  require_relative "saper/core/runtime"

end
@@ -0,0 +1,14 @@
1
module Saper
  module Actions
    # Action that concatenates its input with the given text argument
    # (input first, argument second).
    class AppendWith < Action

      argument :text
      accepts :text, returns: :text

      run { |input, string| "#{input}#{string}" }

    end
  end
end
@@ -0,0 +1,14 @@
1
module Saper
  module Actions
    # Action that converts a text or document input to HTML by calling
    # +to_html+ on it.
    class ConvertToHTML < Action

      accepts :text, returns: :html
      accepts :document, returns: :html

      run { |input| input.to_html }

    end
  end
end
@@ -0,0 +1,14 @@
1
module Saper
  module Actions
    # Action that converts a text or document input to JSON by calling
    # +to_json+ on it.
    class ConvertToJSON < Action

      accepts :text, returns: :json
      accepts :document, returns: :json

      run { |input| input.to_json }

    end
  end
end
@@ -0,0 +1,13 @@
1
module Saper
  module Actions
    # Action that converts an HTML input to markdown by calling
    # +to_markdown+ on it.
    class ConvertToMarkdown < Action

      accepts :html, returns: :markdown

      run { |input| input.to_markdown }

    end
  end
end
@@ -0,0 +1,15 @@
1
module Saper
  module Actions
    # Action that converts a text input to a time value by calling
    # +to_time+ on it with the two text arguments; the second argument is
    # optional.
    class ConvertToTime < Action

      argument :text
      argument :text, optional: true
      accepts :text, returns: :time

      run { |input, format, tz| input.to_time(format, tz) }

    end
  end
end