rusty 0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +2 -0
- data/Gemfile +9 -0
- data/Gemfile.lock +44 -0
- data/LICENSE.BSD +26 -0
- data/README.md +200 -0
- data/Rakefile +13 -0
- data/bin/watchr +27 -0
- data/lib/rusty.rb +16 -0
- data/lib/rusty/callback_binding.rb +71 -0
- data/lib/rusty/dx.rb +127 -0
- data/lib/rusty/helpers.rb +40 -0
- data/lib/rusty/nokogiri_ext.rb +101 -0
- data/lib/rusty/rule_set.rb +124 -0
- data/lib/rusty/scope.rb +29 -0
- data/lib/rusty/selector.rb +87 -0
- data/lib/rusty/version.rb +8 -0
- data/rusty.gemspec +24 -0
- data/tasks/rdoc.rake +18 -0
- data/tasks/test.rake +8 -0
- data/test/helper.rb +17 -0
- data/test/test_dx.rb +62 -0
- data/test/test_helper.rb +39 -0
- data/test/test_nokogiri_ext.rb +73 -0
- data/test/test_rss_example.rb +58 -0
- data/test/test_rule_set.rb +65 -0
- data/test/test_scope.rb +97 -0
- data/test/test_selector.rb +48 -0
- metadata +103 -0
@@ -0,0 +1,40 @@
|
|
1
|
+
# This file is part of the rusty ruby gem.
|
2
|
+
#
|
3
|
+
# Copyright (c) 2013 @radiospiel
|
4
|
+
# Distributed under the terms of the modified BSD license, see LICENSE.BSD
|
5
|
+
|
6
|
+
# Helper support for Rusty.
|
7
|
+
module Rusty::Helpers
  # Returns the list of helper modules registered so far. The list is
  # created lazily on first access and stored in @helpers.
  def helpers
    @helpers ||= []
  end

  # Registers helpers: any number of existing modules, and/or a block
  # whose method definitions are collected into a fresh anonymous module.
  #
  # Example:
  #
  #   module MyParser
  #     extend Rusty::RuleSet
  #
  #     helper Rusty::Helpers::Text
  #
  #     helper do
  #       def foo
  #         "bar"
  #       end
  #     end
  #   end
  #
  # Returns the helpers list when a block was given, nil otherwise.
  def helper(*mods, &block)
    helpers.concat(mods)
    return unless block

    # Module.new evaluates the block in the context of the new module,
    # so every `def` inside the block becomes a method of that module.
    helpers << Module.new(&block)
  end
end
|
32
|
+
|
33
|
+
# Some Text helpers.
|
34
|
+
module Rusty::Helpers::Text
  # Returns a cleaned version of a node's text: every U+200E
  # (left-to-right mark) character is removed, then leading and trailing
  # whitespace is stripped.
  def text(node)
    raw = node.text
    raw.gsub(/\u200e/, "").strip
  end
end
|
@@ -0,0 +1,101 @@
|
|
1
|
+
# This file is part of the rusty ruby gem.
|
2
|
+
#
|
3
|
+
# Copyright (c) 2013 @radiospiel
|
4
|
+
# Distributed under the terms of the modified BSD license, see LICENSE.BSD
|
5
|
+
|
6
|
+
# Nokogiri extensions as used by rusty
|
7
|
+
|
8
|
+
# ---------------------------------------------------------------------
|
9
|
+
|
10
|
+
class Nokogiri::XML::Node
  # Returns the node's attributes as a Hash, mapping each attribute name
  # to the attribute's value.
  def attributes_hash
    attributes.each_with_object({}) do |(name, attr), hash|
      hash[name] = attr.value
    end
  end

  # Returns the entries of the node's "class" attribute as an Array of
  # Strings; an empty Array when the attribute is missing.
  def classes
    value = self["class"]
    value ? value.strip.split(/\s+/) : []
  end

  # Does this node have a class with the given name? The name may be
  # anything responding to #to_s, e.g. a String or a Symbol.
  #
  # The class list is read from the attribute on every call, so the
  # answer stays correct after the "class" attribute has been changed.
  def has_class?(name)
    classes.include?(name.to_s)
  end

  # Returns the chain of ancestors of this node, outermost first, not
  # including the node itself. Returns an empty Array when the node has
  # no parent or when its parent is the document.
  def parents
    return [] if parent.nil? || parent == document

    self_and_parents(parent)
  end

  # Returns +node+ preceded by all of its ancestors, outermost first.
  # The walk stops at the node whose parent is the document, or at a
  # node that has no parent at all (instead of failing on nil).
  def self_and_parents(node = self)
    chain = []

    while node
      chain.unshift(node)

      up = node.parent
      break if up.nil? || up == node.document

      node = up
    end

    chain
  end

  # Returns the debug name for this node, a simplified CSS-like name of
  # the form
  #
  #   name{#id}{.class}{.class}
  #
  # A leading "div" is dropped when an id or class follows it.
  def simplified_name
    parts = [name]

    id = self["id"]
    parts << "##{id}" if id

    classes.each { |klass| parts << ".#{klass}" }

    # \A anchors at the start of the string; only one substitution is
    # ever intended here.
    parts.join.sub(/\Adiv([.#])/, "\\1")
  end
end
|
59
|
+
|
60
|
+
class Nokogiri::HTML::Document
  # Returns the encoding declared in the document's meta elements, or
  # nil when none is declared. Available only in HTML documents.
  #
  # Two declaration styles are checked, in this order:
  #
  #   <meta charset="utf-8">                                  (HTML5)
  #   <meta http-equiv="Content-Type"
  #         content="text/html; charset=utf-8">               (HTML4)
  def meta_encoding
    # HTML5: the first non-blank charset attribute wins.
    css("meta[charset]").each do |meta|
      charset = meta["charset"].to_s.strip
      return charset unless charset.empty?
    end

    # HTML4: the http-equiv value is compared case-insensitively, so both
    # "content-type" and "Content-Type" are recognized.
    css("meta[http-equiv]").each do |meta|
      next unless meta["http-equiv"].to_s.strip.casecmp("content-type") == 0

      content = meta["content"]
      next unless content

      # Accept "charset=x" anywhere in the content value, with or without
      # a space after the ";" separator, optionally quoted, in any case.
      match = content.match(/charset\s*=\s*["']?([^\s;"']+)/i)
      return match[1] if match
    end

    nil
  end
end
|
82
|
+
|
83
|
+
module Nokogiri::HTML
  # Loads a document from +data+. If the encoding reported by the parsed
  # document does not match the encoding declared in its meta elements
  # (see Document#meta_encoding), the data is parsed a second time with
  # the declared encoding.
  #
  # Encoding names are compared case-insensitively, so "UTF-8" and
  # "utf-8" count as the same encoding and do not trigger a reparse.
  #
  # Returns the reparsed document when it reports the declared encoding,
  # and the originally parsed document in every other case.
  def self.with_meta_encoding(data)
    doc = Nokogiri.HTML(data)

    declared = doc.meta_encoding
    return doc unless declared
    # doc.encoding may be nil; to_s turns that into "", which never
    # equals a declared encoding name.
    return doc if doc.encoding.to_s.casecmp(declared) == 0

    # try to reread with the declared encoding
    reparsed = Nokogiri.HTML(data, nil, declared)
    return reparsed if reparsed.encoding.to_s.casecmp(declared) == 0

    # rereading failed, return original document
    doc
  end
end
|
101
|
+
|
@@ -0,0 +1,124 @@
|
|
1
|
+
# This file is part of the rusty ruby gem.
|
2
|
+
#
|
3
|
+
# Copyright (c) 2013 @radiospiel
|
4
|
+
# Distributed under the terms of the modified BSD license, see LICENSE.BSD
|
5
|
+
|
6
|
+
module Rusty::RuleSet
  include Rusty::Helpers

  # A rule combines a selector with a proc.
  class Rule < Struct.new(:selector, :proc)
  end

  # Records a rule for each of the given selectors.
  #
  # The rule is run when a matching node gets processed, i.e. before the
  # node's children are visited.
  def on(*selectors, &block)
    register_rule(:on, *selectors, &block)
  end

  # Records a rule for each of the given selectors.
  #
  # The rule is run once processing of a matching node is done, i.e.
  # after the node's children have been visited.
  #
  # Note: The after method is similar to
  #
  #   on "selector" do
  #     callback do
  #       do_something
  #     end
  #   end
  def after(*selectors, &block)
    register_rule(:after, *selectors, &block)
  end

  # Returns the best matching rule for a given node, or nil if no rule
  # matches: of all rules whose selector matches the node, the one with
  # the highest selector weight.
  #
  # Mode should be :on or :after
  def best_rule(mode, node)
    candidates = rules_for_mode(mode).values.select do |rule|
      rule.selector.match?(node)
    end

    candidates.sort_by { |rule| rule.selector.weight }.last
  end

  # Transforms a node and returns the scope holding the transformed data.
  #
  # When a document is passed in, its root node is transformed. When no
  # scope is passed in, a new top-level scope is created for the node.
  def transform!(node, scope = nil)
    node = node.root if node.is_a?(Nokogiri::XML::Document)

    scope ||= Rusty::Scope.new(node)

    # The callback scope for this node; rule procs are evaluated in it.
    callback_binding = callback_binding_klass.new(scope)

    has_rule = false

    [ :on, :after ].each do |mode|
      # run the explicit rule for this node in this mode, if there is one.
      rule = best_rule(mode, node)
      if rule
        has_rule = true
        callback_binding.instance_eval(&rule.proc)
      end

      # in :on mode: process child nodes, unless explicitly skipped.
      # Text, CDATA and comment children are not transformed.
      if mode == :on && !callback_binding.skip?
        node.children.each do |child|
          next if child.text? || child.cdata? || child.comment?

          transform! child, Rusty::Scope.new(child, scope)
        end
      end

      # run the callback held by the binding, if any. This check happens
      # at the end of both passes.
      # NOTE(review): whether the binding clears its callback after the
      # :on pass is decided in Rusty::CallbackBinding - confirm there.
      callback = callback_binding.callback
      callback_binding.instance_eval(&callback) if callback
    end

    # Warn if neither an :on nor an :after rule matched this node.
    unless has_rule
      path = node.self_and_parents.map(&:simplified_name).join(" > ")
      STDERR.puts "no rule registered: #{path}"
    end

    scope
  end

  private

  # Returns the hash of rules (selector string => Rule) for a given mode.
  # Both the outer and the per-mode hash are created on first use.
  def rules_for_mode(mode)
    by_mode = (@rules ||= {})
    by_mode[mode] ||= {}
  end

  # Registers a rule for a number of selectors in a given mode.
  # Mode should be :on or :after
  #
  # Each argument may hold several comma separated selectors; every
  # single selector gets its own Rule. Registering a selector a second
  # time in the same mode replaces the earlier rule and prints a notice
  # to STDERR.
  def register_rule(mode, *selectors, &block)
    registry = rules_for_mode(mode)

    single_selectors = selectors.flat_map do |list|
      list.split(",").map(&:strip)
    end

    single_selectors.each do |selector|
      if registry[selector]
        STDERR.puts "#{name}, in mode :#{mode}: redefining rule for #{selector}"
      end

      registry[selector] = Rule.new(Rusty::Selector.new(selector), block)
    end
  end

  # Returns the class for callback bindings in this RuleSet, built once
  # by Rusty::CallbackBinding.subclass_with_name_and_helpers from this
  # module's name and its registered helpers, and then memoized.
  def callback_binding_klass
    @callback_binding_klass ||=
      Rusty::CallbackBinding.subclass_with_name_and_helpers(name, *helpers)
  end
end
|
data/lib/rusty/scope.rb
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# This file is part of the rusty ruby gem.
|
2
|
+
#
|
3
|
+
# Copyright (c) 2013 @radiospiel
|
4
|
+
# Distributed under the terms of the modified BSD license, see LICENSE.BSD
|
5
|
+
|
6
|
+
#
|
7
|
+
# A Rusty output scope is related to an input node,
|
8
|
+
# and might or might not have a parent.
|
9
|
+
class Rusty::Scope < Rusty::DX
  # The input node this scope belongs to. Readable from outside; the
  # writer is private. (attr_accessor replaces the obsolete
  # `attr :node, true` form.)
  attr_accessor :node
  private :node=

  # Creates a scope for +node+. +parent+ is the enclosing scope, or nil
  # for a top-level scope.
  def initialize(node, parent = nil)
    @node = node
    @parent = parent
  end

  # Does this scope match a given name?
  #
  # The special name "document" matches exactly the scopes without a
  # parent. Any other name matches when it equals the node's name or is
  # one of the node's classes.
  def has_name?(name)
    return @parent.nil? if name == "document"

    node.name == name || node.has_class?(name)
  end

  # Yields all scopes starting at self up to the top: first this scope,
  # then its parent, and so on until a scope without a parent is reached.
  def up!(&block)
    yield(self)
    @parent.up!(&block) if @parent
  end
end
|
@@ -0,0 +1,87 @@
|
|
1
|
+
# This file is part of the rusty ruby gem.
|
2
|
+
#
|
3
|
+
# Copyright (c) 2013 @radiospiel
|
4
|
+
# Distributed under the terms of the modified BSD license, see LICENSE.BSD
|
5
|
+
|
6
|
+
# -- selector engines ---------------------------------------------------------
|
7
|
+
|
8
|
+
# A selector is an object which is created based on a selector string, and
|
9
|
+
# which implements the `weight` and `match?` methods.
|
10
|
+
module Rusty::Selector
  # A simple, nokogiri based CSS matcher.
  class CSS
    attr_reader :matcher, :name

    # Creates a selector from a CSS selector string. The string is also
    # the selector's name.
    def initialize(selector)
      @selector = selector
      @name = selector

      # == special case: "*"
      #
      # The "*" selector matches all nodes, and Nokogiri::XML::Node#css
      # returns a huge array of nodes, which match? would have to walk
      # through. Answering directly avoids building that array.
      #
      # Note: the method is defined on this very object, so it also takes
      # precedence over match? methods defined in subclasses.
      return unless @selector == "*"

      define_singleton_method(:match?) { |node| !node.nil? }
    end

    # The weight of the selector; close to, but not exactly the same as
    # CSS's weight definition. The selector is split at whitespace and
    # the weights of all parts are added up. Computed once, then memoized.
    def weight
      @weight ||= @selector.split(/\s+/).inject(0) do |total, part|
        total + part_weight(part)
      end
    end

    # Does this selector match a specific node? Runs the selector against
    # the node's document and checks whether the node is in the result.
    def match?(node)
      node ? node.document.css(@selector).include?(node) : false
    end

    private

    # The weight of a single whitespace separated part of the selector.
    def part_weight(part)
      case part
      when /#/          then 1_000_000  # part with an ID, much much weight
      when /\./         then 1_000      # part with a class
      when /^[a-zA-Z_]/ then 1_000      # node name
      else                   1
      end
    end
  end

  # A cached CSS selector, caches matching nodes within a document.
  class CachedCSS < CSS
    # Does this selector match a specific node? The selector is run only
    # once per document; later calls for nodes of the same document reuse
    # the remembered result.
    def match?(node)
      return false unless node

      cache_document(node.document)
      @matching_nodes.include?(node)
    end

    private

    # Remembers the nodes matching the selector in +document+, unless
    # this document is the one already cached.
    def cache_document(document)
      unless @cached_document && @cached_document == document
        @cached_document = document
        @matching_nodes = document.css(@selector)
      end
    end
  end

  # You probably want cached selectors, especially when working with
  # larger documents. If these eat too much memory, try to use
  #
  #   DEFAULT_SELECTOR = Rusty::Selector::CSS
  #
  # but expect exploding runtimes: this increases O(m+n) -> O(m*n).
  DEFAULT_SELECTOR = Rusty::Selector::CachedCSS

  # Creates a Selector object for a given `selector` string, using the
  # DEFAULT_SELECTOR implementation.
  def self.new(selector)
    DEFAULT_SELECTOR.new selector
  end
end
|
data/rusty.gemspec
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
# This file is part of the rusty ruby gem.
|
2
|
+
#
|
3
|
+
# Copyright (c) 2013 @radiospiel
|
4
|
+
# Distributed under the terms of the modified BSD license, see LICENSE.BSD
|
5
|
+
require "#{File.dirname(__FILE__)}/lib/rusty/version.rb"
|
6
|
+
|
7
|
+
Gem::Specification.new do |gem|
  gem.name     = "rusty"
  gem.version  = Rusty::VERSION

  gem.authors  = ["radiospiel"]
  gem.email    = ["eno@radiospiel.org"]
  gem.homepage = "http://github.com/radiospiel/rusty"
  gem.summary  = "XML parsing without the hassle."

  gem.description = gem.summary

  gem.add_dependency "nokogiri"

  # `git ls-files` prints one path per line. Split on newlines only:
  # splitting on $\ (nil by default) splits on any whitespace and would
  # tear apart file names that contain spaces.
  gem.files         = `git ls-files`.split("\n")
  gem.executables   = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]
end
|
data/tasks/rdoc.rake
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
# This file is part of the rusty ruby gem.
|
2
|
+
#
|
3
|
+
# Copyright (c) 2013 @radiospiel
|
4
|
+
# Distributed under the terms of the modified BSD license, see LICENSE.BSD
|
5
|
+
|
6
|
+
# Put the gem's lib directory on the load path. This file lives in
# tasks/, and expand_path resolves relative to the file itself (not its
# directory), so two levels up are needed to reach the gem root.
$:.unshift File.expand_path("../../lib", __FILE__)

require 'rdoc/task'

# Defines the rdoc tasks: documentation for the README and all library
# files is generated into the rdoc/ directory, titled with the gem version.
RDoc::Task.new do |rdoc|
  require_relative "../lib/rusty/version"
  version = Rusty::VERSION

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "rusty #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
|