saper 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE +21 -0
  3. data/README.md +126 -0
  4. data/Rakefile +17 -0
  5. data/bin/saper +60 -0
  6. data/lib/lib/json_search.rb +54 -0
  7. data/lib/lib/mechanize.rb +26 -0
  8. data/lib/lib/nokogiri.rb +12 -0
  9. data/lib/saper.rb +37 -0
  10. data/lib/saper/actions/append_with.rb +14 -0
  11. data/lib/saper/actions/convert_to_html.rb +14 -0
  12. data/lib/saper/actions/convert_to_json.rb +14 -0
  13. data/lib/saper/actions/convert_to_markdown.rb +13 -0
  14. data/lib/saper/actions/convert_to_time.rb +15 -0
  15. data/lib/saper/actions/convert_to_xml.rb +14 -0
  16. data/lib/saper/actions/create_atom.rb +18 -0
  17. data/lib/saper/actions/fetch.rb +17 -0
  18. data/lib/saper/actions/find.rb +18 -0
  19. data/lib/saper/actions/find_first.rb +16 -0
  20. data/lib/saper/actions/get_attribute.rb +15 -0
  21. data/lib/saper/actions/get_contents.rb +14 -0
  22. data/lib/saper/actions/get_text.rb +14 -0
  23. data/lib/saper/actions/prepend_with.rb +14 -0
  24. data/lib/saper/actions/remove_after.rb +14 -0
  25. data/lib/saper/actions/remove_before.rb +14 -0
  26. data/lib/saper/actions/remove_matching.rb +14 -0
  27. data/lib/saper/actions/remove_tags.rb +15 -0
  28. data/lib/saper/actions/replace.rb +15 -0
  29. data/lib/saper/actions/run_recipe.rb +24 -0
  30. data/lib/saper/actions/run_recipe_and_save.rb +22 -0
  31. data/lib/saper/actions/save.rb +14 -0
  32. data/lib/saper/actions/select_matching.rb +14 -0
  33. data/lib/saper/actions/set_input.rb +19 -0
  34. data/lib/saper/actions/skip_tags.rb +15 -0
  35. data/lib/saper/actions/split.rb +24 -0
  36. data/lib/saper/arguments/attribute.rb +11 -0
  37. data/lib/saper/arguments/recipe.rb +42 -0
  38. data/lib/saper/arguments/text.rb +11 -0
  39. data/lib/saper/arguments/timezone.rb +11 -0
  40. data/lib/saper/arguments/variable.rb +11 -0
  41. data/lib/saper/arguments/xpath.rb +11 -0
  42. data/lib/saper/core/action.rb +209 -0
  43. data/lib/saper/core/argument.rb +106 -0
  44. data/lib/saper/core/browser.rb +87 -0
  45. data/lib/saper/core/dsl.rb +68 -0
  46. data/lib/saper/core/error.rb +47 -0
  47. data/lib/saper/core/item.rb +70 -0
  48. data/lib/saper/core/keychain.rb +18 -0
  49. data/lib/saper/core/logger.rb +74 -0
  50. data/lib/saper/core/namespace.rb +139 -0
  51. data/lib/saper/core/recipe.rb +134 -0
  52. data/lib/saper/core/runtime.rb +237 -0
  53. data/lib/saper/core/type.rb +45 -0
  54. data/lib/saper/items/atom.rb +64 -0
  55. data/lib/saper/items/document.rb +66 -0
  56. data/lib/saper/items/html.rb +85 -0
  57. data/lib/saper/items/json.rb +67 -0
  58. data/lib/saper/items/markdown.rb +36 -0
  59. data/lib/saper/items/nothing.rb +15 -0
  60. data/lib/saper/items/text.rb +54 -0
  61. data/lib/saper/items/time.rb +42 -0
  62. data/lib/saper/items/url.rb +34 -0
  63. data/lib/saper/items/xml.rb +79 -0
  64. data/lib/saper/version.rb +3 -0
  65. data/spec/actions/append_with_spec.rb +30 -0
  66. data/spec/actions/convert_to_html_spec.rb +24 -0
  67. data/spec/actions/convert_to_json_spec.rb +24 -0
  68. data/spec/actions/convert_to_markdown_spec.rb +24 -0
  69. data/spec/actions/convert_to_time_spec.rb +37 -0
  70. data/spec/actions/convert_to_xml_spec.rb +24 -0
  71. data/spec/actions/create_atom_spec.rb +31 -0
  72. data/spec/actions/fetch_spec.rb +7 -0
  73. data/spec/actions/find_first_spec.rb +7 -0
  74. data/spec/actions/find_spec.rb +7 -0
  75. data/spec/actions/get_attribute_spec.rb +7 -0
  76. data/spec/actions/get_contents.rb +7 -0
  77. data/spec/actions/get_text.rb +7 -0
  78. data/spec/actions/prepend_with_spec.rb +30 -0
  79. data/spec/actions/remove_after.rb +7 -0
  80. data/spec/actions/remove_before.rb +7 -0
  81. data/spec/actions/replace_spec.rb +7 -0
  82. data/spec/actions/run_recipe_and_save_spec.tmp.rb +52 -0
  83. data/spec/actions/run_recipe_spec.tmp.rb +53 -0
  84. data/spec/actions/save_spec.rb +7 -0
  85. data/spec/actions/select_matching_spec.rb +7 -0
  86. data/spec/actions/set_input_spec.rb +7 -0
  87. data/spec/actions/skip_tags_spec.rb +7 -0
  88. data/spec/actions/split_spec.rb +7 -0
  89. data/spec/core/action_spec.rb +151 -0
  90. data/spec/core/argument_spec.rb +79 -0
  91. data/spec/core/browser_spec.rb +7 -0
  92. data/spec/core/dsl_spec.rb +7 -0
  93. data/spec/core/item_spec.rb +7 -0
  94. data/spec/core/keychain_spec.rb +7 -0
  95. data/spec/core/logger_spec.rb +7 -0
  96. data/spec/core/namespace_spec.rb +18 -0
  97. data/spec/core/recipe_spec.rb +81 -0
  98. data/spec/core/runtime_spec.rb +165 -0
  99. data/spec/core/type_spec.rb +7 -0
  100. data/spec/items/atom_spec.rb +7 -0
  101. data/spec/items/document_spec.rb +7 -0
  102. data/spec/items/html_spec.rb +7 -0
  103. data/spec/items/json_spec.rb +7 -0
  104. data/spec/items/markdown_spec.rb +7 -0
  105. data/spec/items/nothing_spec.rb +7 -0
  106. data/spec/items/text_spec.rb +17 -0
  107. data/spec/items/time_spec.rb +7 -0
  108. data/spec/items/url_spec.rb +7 -0
  109. data/spec/items/xml_spec.rb +17 -0
  110. data/spec/spec_helper.rb +22 -0
  111. metadata +355 -0
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: be3fc3ae66117c4a6e4c28cb4a85834f9de5bea4
4
+ data.tar.gz: 41f8df60e4ee69d04868beef49a234218ec7e3d2
5
+ SHA512:
6
+ metadata.gz: 6533dd86c1d9b23cf20094666131d78197a89782a5d9ff6b4237127a14ab8f6be45727461592b2cad3a1d28074b8c4abce17f2da04c95498e5213d216f00f89b
7
+ data.tar.gz: 8b72ff711a4d6244e0ee8655fa40cd280a51961837a73604d090c014ea3f446a6d0ecfb434ed20df3d465038e52bea9ff2151e55aacb01cdf0c101fe267847e0
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2013 Merimond Corporation
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
@@ -0,0 +1,126 @@
1
+ # Saper
2
+
3
+ Saper is a web automation library written in Ruby. It lets you crawl websites and
4
+ extract data in an efficient, controllable and fault-tolerant manner.
5
+
6
+ Common use scenarios:
7
+
8
+ - scrape a website and save data in a structured format
9
+ - create an RSS feed for a website or a web application
10
+ - create an API for a website that doesn't have one
11
+
12
+ ## Installing
13
+
14
+ Make sure you have Ruby and RubyGems, then run:
15
+
16
+ gem install saper
17
+
18
+ ## Recipes
19
+
20
+ A recipe is the core element of Saper. It is a chain of actions run consecutively, so
21
+ that each action processes output of the preceding action. You may create recipes
22
+ by instantiating Ruby classes; however, using the embedded DSL is the preferred method.
23
+
24
+ Here's a recipe that produces a list of recent Bloomberg articles in the 'worldwide
25
+ news' section (note that this data is unavailable via RSS):
26
+
27
+ recipe :bloomberg do
28
+ set_input "http://www.bloomberg.com/news/worldwide"
29
+ fetch
30
+ convert_to_html
31
+ find ".news_item a"
32
+ get_attribute "href"
33
+ prepend_with "http://bloomberg.com"
34
+ end
35
+
36
+ Assuming the file is saved as _myrecipe.txt_, you can now use the command line:
37
+
38
+ $ saper myrecipe.txt --recipe bloomberg
39
+
40
+ Alternatively, you can use Ruby:
41
+
42
+ #!/usr/bin/env ruby
43
+ Saper.run("myrecipe.txt", :bloomberg).results
44
+
45
+ ## Data flow
46
+
47
+ Data flows from one action to another so that output of each action is used as
48
+ input for the next one. Using the example above:
49
+
50
+ set_input "http://www.bloomberg.com/news/worldwide"
51
+ > String
52
+ fetch
53
+ > Document
54
+ convert_to_html
55
+ > HTML
56
+ find ".news_item a"
57
+ > [HTML, HTML, ... ]
58
+ get_attribute "href"
59
+ > [String, String, ... ]
60
+ prepend_with "http://bloomberg.com"
61
+ > [String, String, ... ]
62
+
63
+ Whenever an action returns multiple results (e.g. *find* returns multiple HTML nodes)
64
+ the following action will run several times as well. For instance, if *find* returns
65
+ 10 elements, then *get_attribute* will run 10 times (and produce 10 elements).
66
+
67
+ If any action fails (e.g. a link has no *href* attribute), Saper will silently skip it
68
+ and proceed with the rest. All errors are logged and available for subsequent inspection,
69
+ but no error will ever stop the execution of a recipe -- this is the core idea behind Saper.
70
+
71
+ ## Available actions
72
+
73
+ Below is a list of all available actions:
74
+
75
+ ### Downloading information
76
+
77
+ - **fetch** - download data from URL.
78
+ - **convert_to_json** - parse downloaded data as JSON.
79
+ - **convert_to_html** - parse downloaded data as HTML.
80
+ - **convert_to_xml** - parse downloaded data as XML.
81
+
82
+ ### String manipulations
83
+
84
+ - **convert_to_time** (format, timezone) - convert string to _time_.
85
+ - **append_with** (string) - concatenate input with _string_.
86
+ - **prepend_with** (string) - concatenate _string_ with input.
87
+ - **remove_after** (separator) - search for _separator_ and remove part of input that follows the first occurrence.
88
+ - **remove_before** (separator) - search for _separator_ and remove part of input that precedes the first occurrence.
89
+ - **remove_matching** (regexp) - stop recipe execution for strings that match the specified pattern.
90
+ - **replace** (string, string) - substitute one block of text with another.
91
+ - **select_matching** (regexp) - continue recipe execution only for those strings that match the specified pattern.
92
+ - **split** (separator) - split string into multiple parts using specified _separator_.
93
+
94
+ ### HTML / XML manipulations
95
+
96
+ - **convert_to_markdown** - return tag contents converted to markdown.
97
+ - **find** (xpath) - return nodes matching specified XPath or CSS selector.
98
+ - **find_first** (xpath) - return the first node matching specified XPath or CSS selector.
99
+ - **get_attribute** (name) - returns value of specified attribute.
100
+ - **get_contents** - returns tag contents including any child tags (similar to inner_html).
101
+ - **get_text** - returns text contents of a tag, skipping any tags (similar to inner_text).
102
+ - **remove_tags** (name) - removes child tags including content.
103
+ - **skip_tags** (name) - removes child tags preserving their content.
104
+
105
+ ### Special-purpose actions
106
+
107
+ - **set_input** (string) - sets input for the following action.
108
+ - **create_atom** - create an Atom from saved variables.
109
+ - **run_recipe_and_save** (variable, recipe) - run another recipe and save its result as a variable.
110
+ - **run_recipe** (recipe) - run another recipe and use its output as input for the next action.
111
+ - **save** (variable) - save input as a variable.
112
+
113
+ # Contributing
114
+
115
+ - Find something you would like to work on.
116
+ - Fork the project and do your work in a topic branch.
117
+ - Make sure your changes will work on both Ruby 1.8.7 and Ruby 1.9.
118
+ - Add tests in spec/ folder for the behavior you want to test.
119
+ - Run all the tests using _rake spec_.
120
+ - Commit your changes and send a pull request.
121
+
122
+ # License
123
+
124
+ Copyright (c) 2013 Merimond Corporation. MIT license, see [LICENSE] for details.
125
+
126
+ [LICENSE]: http://github.com/merimond/saper/blob/master/LICENSE
@@ -0,0 +1,17 @@
1
+ require 'rubygems'
2
+ require 'bundler'
3
+ require 'rspec/core/rake_task'
4
+ require 'yard'
5
+ # Load the bundle's gems and register Bundler's gem helper tasks.
6
+ Bundler.require
7
+ Bundler::GemHelper.install_tasks
8
+ # Spec task; the block leaves the task's settings untouched.
9
+ desc "Run specs"
10
+ RSpec::Core::RakeTask.new do |t|
11
+ # nothing
12
+ end
13
+ # Documentation task covering every Ruby file under lib/saper.
14
+ desc "Generate docs"
15
+ YARD::Rake::YardocTask.new do |t|
16
+ t.files = ['lib/saper/**/*.rb']
17
+ end
@@ -0,0 +1,60 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "saper"
4
+ require 'trollop'
5
+
6
+ opts = Trollop::options do
7
+ banner <<-EOS
8
+ Saper web automation library (version #{Saper::VERSION})
9
+ Usage: saper <filename> [options]
10
+
11
+ Options are ...
12
+ EOS
13
+ opt :input, "Recipe input", :type => :string
14
+ opt :recipe, "Recipe name", :type => :string
15
+ end
16
+
17
+ unless ARGV.size == 1
18
+ Trollop::die "Please specify filename"
19
+ end
20
+
21
+ path = File.expand_path(ARGV.first)
22
+
23
+ unless File.exists?(path) && File.file?(path)
24
+ Trollop::die("File not found")
25
+ end
26
+
27
+ begin
28
+ data = File.read(path)
29
+ rescue
30
+ Trollop::die("File cannot be read")
31
+ end
32
+
33
+ begin
34
+ space = Saper::Namespace.parse(data)
35
+ rescue NoMethodError => e
36
+ Trollop::die("Invalid command: %s" % e.message.match(/`(\w+)'/)[1])
37
+ end
38
+
39
+ if opts[:recipe].nil?
40
+ if space.run_by_default.nil?
41
+ Trollop::die("Recipe name undefined")
42
+ end
43
+ opts[:recipe] = space.run_by_default
44
+ end
45
+
46
+ begin
47
+ recipe = space[opts[:recipe]]
48
+ rescue Saper::RecipeNotFound => e
49
+ Trollop::die("`%s` recipe not found" % opts[:input])
50
+ rescue Saper::ActionNotFound => e
51
+ Trollop::die("Invalid action: %s" % e.message)
52
+ rescue Saper::InvalidArgument => e
53
+ Trolltop::die("Invalid action argument")
54
+ end
55
+
56
+ if opts[:input].nil? && recipe.input_required?
57
+ Trollop::die("No input specified, action requires %s" % recipe.input_required.inspect)
58
+ end
59
+
60
+ puts recipe.run(opts[:input]).results
@@ -0,0 +1,54 @@
1
+ # XPath-style library for JSON
2
+ # Selectors are chains of "/name" (child) and "//name" (descendant) steps; the name "*" selects every value of a hash.
3
+ module JSONSearch
4
+ # Resolves +selector+ against +object+. Arrays are searched element by element; step results are flattened and nils dropped. An empty selector returns +object+ itself; any other unmatched selector raises "Unsupported selector".
5
+ def self.find(object, selector)
6
+ if object.is_a?(Array)
7
+ return object.map { |item|
8
+ find(item, selector)
9
+ }.flatten.compact
10
+ end
11
+ case selector
12
+ when /\A\/([^\/]+)/ # "/name": one child step
13
+ name = $1
14
+ sub = selector.sub($&, "") # remainder of the selector after this step
15
+ children(object, name).map { |child|
16
+ find(child, sub)
17
+ }.flatten.compact
18
+ when /\A\/\/([^\/]+)/ # "//name": one descendant step
19
+ name = $1
20
+ sub = selector.sub($&, "") # remainder of the selector after this step
21
+ descendants(object, name).map { |descendant|
22
+ find(descendant, sub)
23
+ }.flatten.compact
24
+ when /\A\z/ # selector fully consumed
25
+ object
26
+ else
27
+ raise "Unsupported selector"
28
+ end
29
+ end
30
+ # Direct children of +object+ under key +name+ ("*" = all hash values). A hash lacking the key yields [nil]; objects that are neither Array nor Hash yield [].
31
+ def self.children(object, name)
32
+ case object
33
+ when Array
34
+ object.map { |item|
35
+ children(item, name)
36
+ }.flatten
37
+ when Hash
38
+ if name == "*"
39
+ object.values
40
+ else
41
+ [object[name]]
42
+ end
43
+ else
44
+ []
45
+ end
46
+ end
47
+ # Values under key +name+ at any depth below +object+: recurses into every child, then appends the direct matches. The result is a nested array (find flattens it).
48
+ def self.descendants(object, name)
49
+ children(object, "*").map { |child|
50
+ descendants(child, name)
51
+ }.push(*children(object, name))
52
+ end
53
+
54
+ end
@@ -0,0 +1,26 @@
1
+ # Monkey patching Mechanize
2
+ # - allow custom pre_connect_hooks and post_connect_hooks
3
+
4
+ class Mechanize
5
+ # Forwards the given block to the agent's pre_connect_hook.
6
+ def pre_connect_hook(&block)
7
+ @agent.pre_connect_hook(&block)
8
+ end
9
+ # Forwards the given block to the agent's post_connect_hook.
10
+ def post_connect_hook(&block)
11
+ @agent.post_connect_hook(&block)
12
+ end
13
+
14
+ end
15
+ # Agent-side counterparts: each call appends the block to the matching hook list.
16
+ class Mechanize::HTTP::Agent
17
+ # Appends the block to @pre_connect_hooks.
18
+ def pre_connect_hook(&block)
19
+ @pre_connect_hooks << block
20
+ end
21
+ # Appends the block to @post_connect_hooks.
22
+ def post_connect_hook(&block)
23
+ @post_connect_hooks << block
24
+ end
25
+
26
+ end
@@ -0,0 +1,12 @@
1
+ # Monkey patching Nokogiri:
2
+ # - correctly handle &#151; character
3
+
4
+ module Nokogiri
5
+ module HTML
6
+
7
+ def self.parse(thing, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block)
8
+ Document.parse(thing.gsub("&#151;", "—"), url, encoding, options, &block)
9
+ end
10
+
11
+ end
12
+ end
@@ -0,0 +1,37 @@
1
+ module Saper
2
+ # Requires every file matching the glob +dir+ (relative to this file). Matches are sorted so the load order does not depend on the filesystem.
3
+ def self.require_multiple(dir)
4
+ Dir[File.dirname(__FILE__) + "/" + dir].sort.each { |file| require file }
5
+ end
6
+ # External dependencies.
7
+ require 'reverse_markdown'
8
+ require 'json'
9
+ require 'vremya'
10
+ require 'nokogiri'
11
+ require 'securerandom'
12
+ require 'mechanize'
13
+ # Bundled helpers and patches from lib/lib.
14
+ require_relative "lib/json_search"
15
+ require_relative "lib/mechanize"
16
+ require_relative "lib/nokogiri"
17
+ # Library files.
18
+ require_relative "saper/version"
19
+ # core/item, core/argument and core/action are each required before the directory glob that follows them.
20
+ require_relative "saper/core/item"
21
+ require_multiple "saper/items/*.rb"
22
+ require_relative "saper/core/argument"
23
+ require_multiple "saper/arguments/*.rb"
24
+ require_relative "saper/core/action"
25
+ require_multiple "saper/actions/*.rb"
26
+ require_relative "saper/core/error"
27
+ require_relative "saper/core/namespace"
28
+ require_relative "saper/core/type"
29
+ require_relative "saper/core/recipe"
30
+ # Remaining core files.
31
+ require_relative "saper/core/keychain"
32
+ require_relative "saper/core/dsl"
33
+ require_relative "saper/core/browser"
34
+ require_relative "saper/core/logger"
35
+ require_relative "saper/core/runtime"
36
+
37
+ end
@@ -0,0 +1,14 @@
1
+ module Saper
2
+ module Actions
3
+ class AppendWith < Action
4
+ # Appends the text argument to the input; declared as text in, text out.
5
+ argument :text
6
+ accepts :text, :returns => :text
7
+ # Block parameters: the input, then the declared argument; the result is "<input><string>".
8
+ run do |input, string|
9
+ "%s%s" % [input, string]
10
+ end
11
+
12
+ end
13
+ end
14
+ end
@@ -0,0 +1,14 @@
1
+ module Saper
2
+ module Actions
3
+ class ConvertToHTML < Action
4
+ # Converts text or document input to HTML.
5
+ accepts :text, :returns => :html
6
+ accepts :document, :returns => :html
7
+ # The conversion itself is delegated to the input's #to_html.
8
+ run do |input|
9
+ input.to_html
10
+ end
11
+
12
+ end
13
+ end
14
+ end
@@ -0,0 +1,14 @@
1
+ module Saper
2
+ module Actions
3
+ class ConvertToJSON < Action
4
+ # Converts text or document input to JSON.
5
+ accepts :text, :returns => :json
6
+ accepts :document, :returns => :json
7
+ # The conversion itself is delegated to the input's #to_json.
8
+ run do |input|
9
+ input.to_json
10
+ end
11
+
12
+ end
13
+ end
14
+ end
@@ -0,0 +1,13 @@
1
+ module Saper
2
+ module Actions
3
+ class ConvertToMarkdown < Action
4
+ # Converts HTML input to markdown.
5
+ accepts :html, :returns => :markdown
6
+ # The conversion itself is delegated to the input's #to_markdown.
7
+ run do |input|
8
+ input.to_markdown
9
+ end
10
+
11
+ end
12
+ end
13
+ end
@@ -0,0 +1,15 @@
1
+ module Saper
2
+ module Actions
3
+ class ConvertToTime < Action
4
+ # Converts text input to a time. Takes two text arguments; the second is optional.
5
+ argument :text
6
+ argument :text, :optional => true
7
+ accepts :text, :returns => :time
8
+ # Block parameters: the input, then both arguments (named format and tz), passed straight to the input's #to_time.
9
+ run do |input, format, tz|
10
+ input.to_time(format, tz)
11
+ end
12
+
13
+ end
14
+ end
15
+ end