rusty 0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +2 -0
- data/Gemfile +9 -0
- data/Gemfile.lock +44 -0
- data/LICENSE.BSD +26 -0
- data/README.md +200 -0
- data/Rakefile +13 -0
- data/bin/watchr +27 -0
- data/lib/rusty.rb +16 -0
- data/lib/rusty/callback_binding.rb +71 -0
- data/lib/rusty/dx.rb +127 -0
- data/lib/rusty/helpers.rb +40 -0
- data/lib/rusty/nokogiri_ext.rb +101 -0
- data/lib/rusty/rule_set.rb +124 -0
- data/lib/rusty/scope.rb +29 -0
- data/lib/rusty/selector.rb +87 -0
- data/lib/rusty/version.rb +8 -0
- data/rusty.gemspec +24 -0
- data/tasks/rdoc.rake +18 -0
- data/tasks/test.rake +8 -0
- data/test/helper.rb +17 -0
- data/test/test_dx.rb +62 -0
- data/test/test_helper.rb +39 -0
- data/test/test_nokogiri_ext.rb +73 -0
- data/test/test_rss_example.rb +58 -0
- data/test/test_rule_set.rb +65 -0
- data/test/test_scope.rb +97 -0
- data/test/test_selector.rb +48 -0
- metadata +103 -0
@@ -0,0 +1,40 @@
|
|
1
|
+
# This file is part of the rusty ruby gem.
|
2
|
+
#
|
3
|
+
# Copyright (c) 2013 @radiospiel
|
4
|
+
# Distributed under the terms of the modified BSD license, see LICENSE.BSD
|
5
|
+
|
6
|
+
# Helper support for Rusty.
|
7
|
+
module Rusty::Helpers
  # Returns the list of helper modules registered so far. The list is
  # created (empty) on first access and kept in @helpers.
  def helpers
    @helpers ||= []
  end

  # Registers helpers: every module passed in +mods+ is appended to the
  # helpers list, and, when a block is given, an anonymous module whose
  # body is that block is appended as well. Examples:
  #
  #   module MyParser
  #     extend Rusty::RuleSet
  #
  #     helper Rusty::Helpers::Text
  #
  #     helper do
  #       def foo
  #         "bar"
  #       end
  #     end
  #   end
  #
  # Returns the helpers list when a block was given, nil otherwise.
  def helper(*mods, &block)
    registered = helpers
    registered.concat(mods)
    registered << Module.new(&block) if block
  end
end
|
32
|
+
|
33
|
+
# Some Text helpers.
|
34
|
+
module Rusty::Helpers::Text
  # Returns a cleaned version of a node's text: every U+200E
  # (LEFT-TO-RIGHT MARK) character is removed, then leading and trailing
  # whitespace is stripped.
  def text(node)
    without_marks = node.text.gsub(/\u200e/, "")
    without_marks.strip
  end
end
|
@@ -0,0 +1,101 @@
|
|
1
|
+
# This file is part of the rusty ruby gem.
|
2
|
+
#
|
3
|
+
# Copyright (c) 2013 @radiospiel
|
4
|
+
# Distributed under the terms of the modified BSD license, see LICENSE.BSD
|
5
|
+
|
6
|
+
# Nokogiri extensions as used by rusty
|
7
|
+
|
8
|
+
# ---------------------------------------------------------------------
|
9
|
+
|
10
|
+
class Nokogiri::XML::Node
  # Returns this node's attributes as a Hash mapping each attribute name
  # to its (String) value.
  def attributes_hash
    attributes.each_with_object({}) do |(name, attr), hash|
      hash[name] = attr.value
    end
  end

  # Returns the node's CSS classes as an Array of Strings; empty when the
  # node has no "class" attribute.
  def classes
    class_attr = self["class"]
    return [] unless class_attr

    class_attr.strip.split(/\s+/)
  end

  # Does this node have a class with the given name? +name+ may be a
  # String or a Symbol.
  #
  # The class list is read from the attribute on every call: a memoized
  # copy would go stale as soon as the "class" attribute is modified, and
  # comparing Strings avoids interning arbitrary class names as Symbols.
  def has_class?(name)
    classes.include?(name.to_s)
  end

  # Returns the ancestors of this node, outermost first, excluding the
  # document node. Returns [] when the node has no parent or its parent
  # is the document.
  def parents
    return [] if parent.nil? || parent == document

    self_and_parents(parent)
  end

  # Returns +node+ followed upwards by all of its ancestors, ordered
  # outermost first, up until and excluding the document node.
  #
  # The walk also stops when a node has no parent, so detached nodes
  # yield the chain up to their topmost ancestor instead of raising.
  def self_and_parents(node = self)
    chain = []
    while node && node != node.document
      chain.unshift(node)
      node = node.parent
    end
    chain
  end

  # Returns the debug name for this node, which is a simplified CSS name
  # of the form
  #
  #   name{#id}{.class}{.class}
  #
  # A leading "div" is dropped when an id or class follows it, e.g.
  # "div#main.wide" becomes "#main.wide".
  def simplified_name
    label = name.dup

    id = self["id"]
    label << "##{id}" if id
    classes.each { |klass| label << ".#{klass}" }

    # \A anchors at the start of the string only; ^ would also match
    # after a newline embedded in an id.
    label.sub(/\Adiv([.#])/, "\\1")
  end
end
|
59
|
+
|
60
|
+
class Nokogiri::HTML::Document
  # Returns the encoding name declared in the document's <meta> tags, or
  # nil when there is no such declaration. Available only in HTML
  # documents.
  #
  # Two forms are recognized, in this order:
  #
  #   <meta charset="utf-8">                                   (HTML5)
  #   <meta http-equiv="Content-Type"
  #         content="text/html; charset=utf-8">                (HTML4)
  def meta_encoding
    # HTML5; empty declarations are skipped.
    css("meta[charset]").each do |meta|
      charset = meta["charset"].to_s.strip
      return charset unless charset.empty?
    end

    # HTML4. The http-equiv value is compared case-insensitively, because
    # a CSS attribute selector would only match one exact spelling and
    # miss the common "Content-Type".
    css("meta[http-equiv]").each do |meta|
      next unless meta["http-equiv"].to_s.strip.downcase == "content-type"

      # Accept any spacing around ";" and "=", optional quotes, and any
      # letter case of the "charset" parameter name.
      charset = meta["content"].to_s[/charset\s*=\s*["']?([^\s;"']+)/i, 1]
      return charset if charset
    end

    nil
  end
end
|
82
|
+
|
83
|
+
module Nokogiri::HTML
  # Loads a document from +data+. If the encoding as determined by
  # Nokogiri does not match the document's meta_encoding, tries to reload
  # the data with that encoding.
  #
  # Returns the re-parsed document when it reports the declared encoding,
  # otherwise the document from the first parse.
  #
  # Encoding names are compared case-insensitively, so "utf-8" and
  # "UTF-8" count as the same encoding and do not trigger a re-parse.
  def self.with_meta_encoding(data)
    doc = Nokogiri.HTML(data)

    declared = doc.meta_encoding
    return doc unless declared

    wanted = declared.downcase
    return doc if doc.encoding.to_s.downcase == wanted

    # try to reread with the declared encoding
    reparsed = Nokogiri.HTML(data, nil, declared)
    return reparsed if reparsed.encoding.to_s.downcase == wanted

    # rereading failed, return original document
    doc
  end
end
|
101
|
+
|
@@ -0,0 +1,124 @@
|
|
1
|
+
# This file is part of the rusty ruby gem.
|
2
|
+
#
|
3
|
+
# Copyright (c) 2013 @radiospiel
|
4
|
+
# Distributed under the terms of the modified BSD license, see LICENSE.BSD
|
5
|
+
|
6
|
+
module Rusty::RuleSet
  include Rusty::Helpers

  # A rule combines a selector object with the proc to evaluate for
  # nodes matched by that selector.
  class Rule < Struct.new(:selector, :proc)
  end

  # Records a rule for any of these selectors.
  #
  # The rule is evaluated when a node gets processed, before the node's
  # children are visited.
  def on(*selectors, &block)
    register_rule(:on, *selectors, &block)
  end

  # Records a rule for any of these selectors.
  #
  # The rule is evaluated when a node's processing is done, i.e. after
  # its children have been visited.
  #
  # Note: The after method is similar to
  #
  #   on "selector" do
  #     callback do
  #       do_something
  #     end
  #   end
  def after(*selectors, &block)
    register_rule(:after, *selectors, &block)
  end

  # Returns the best matching rule for a given node, or nil if no rule
  # matches. Among matching rules the one whose selector has the highest
  # weight is chosen. Mode should be :on or :after.
  def best_rule(mode, node)
    candidates = rules_for_mode(mode).values.select do |rule|
      rule.selector.match?(node)
    end

    candidates.sort_by { |rule| rule.selector.weight }.last
  end

  # Transforms a node and returns the scope holding the transformed data.
  #
  # When a Nokogiri::XML::Document is passed in, its root node is used.
  # A fresh scope is created unless one is passed in. For each of the
  # modes :on and :after the best matching rule (if any) is evaluated in
  # the node's callback binding; in :on mode the child nodes (text, CDATA
  # and comment nodes excepted) are transformed recursively, unless the
  # binding reports skip?. When neither mode has a matching rule, a
  # message with the node's path is written to STDERR.
  def transform!(node, scope = nil)
    node = node.root if node.is_a?(Nokogiri::XML::Document)
    scope ||= Rusty::Scope.new(node)

    # The callback binding for this node; rules are evaluated inside it.
    node_binding = callback_binding_klass.new(scope)
    matched = false

    [ :on, :after ].each do |mode|
      # find explicit rule for this node.
      rule = best_rule(mode, node)
      if rule
        matched = true
        node_binding.instance_eval(&rule.proc)
      end

      # in :on mode: process children, unless explicitly skipped.
      if mode == :on && !node_binding.skip?
        node.children.each do |child|
          next if child.text? || child.cdata? || child.comment?

          transform! child, Rusty::Scope.new(child, scope)
        end
      end

      # run callback
      # NOTE(review): this check happens once per mode; whether a callback
      # can run twice depends on CallbackBinding#callback, which is not
      # visible here -- confirm.
      pending = node_binding.callback
      node_binding.instance_eval(&pending) if pending
    end

    unless matched
      path = node.self_and_parents.map(&:simplified_name).join(" > ")
      STDERR.puts "no rule registered: #{path}"
    end

    scope
  end

  private

  # Returns the hash of rules (selector string => Rule) in a given mode,
  # creating it on first use.
  def rules_for_mode(mode)
    @rules ||= {}
    @rules[mode] ||= {}
  end

  # Registers a rule for a number of selectors in a given mode.
  # Mode should be :on or :after.
  #
  # Each argument may hold several comma separated selectors; every
  # single selector gets its own Rule sharing the same block. Redefining
  # a selector in the same mode prints a notice to STDERR and replaces
  # the previous rule.
  def register_rule(mode, *selectors, &block)
    registry = rules_for_mode(mode)

    selectors.flat_map { |list| list.split(",").map(&:strip) }.each do |selector|
      if registry[selector]
        STDERR.puts "#{name}, in mode :#{mode}: redefining rule for #{selector}"
      end

      registry[selector] = Rule.new(Rusty::Selector.new(selector), block)
    end
  end

  # Returns the class for callback bindings in this RuleSet, built once
  # via Rusty::CallbackBinding.subclass_with_name_and_helpers from the
  # extending module's name and its registered helpers.
  def callback_binding_klass
    @callback_binding_klass ||= Rusty::CallbackBinding.subclass_with_name_and_helpers name, *helpers
  end
end
|
data/lib/rusty/scope.rb
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# This file is part of the rusty ruby gem.
|
2
|
+
#
|
3
|
+
# Copyright (c) 2013 @radiospiel
|
4
|
+
# Distributed under the terms of the modified BSD license, see LICENSE.BSD
|
5
|
+
|
6
|
+
#
|
7
|
+
# A Rusty output scope is related to an input node,
|
8
|
+
# and might or might not have a parent.
|
9
|
+
class Rusty::Scope < Rusty::DX
  # The input node this scope belongs to. The writer exists, but is
  # private. (attr_reader/attr_writer replace the deprecated boolean
  # form "attr :node, true".)
  attr_reader :node
  attr_writer :node
  private :node=

  # Creates a scope for +node+; +parent+ is the enclosing scope, or nil
  # for the topmost scope.
  def initialize(node, parent=nil)
    @node, @parent = node, parent
  end

  # Does this scope match a given name?
  #
  # The special name "document" matches the scope without a parent. Any
  # other name matches when it equals the node's name or is one of the
  # node's classes.
  def has_name?(name)
    return @parent.nil? if name == "document"

    node.name == name || node.has_class?(name)
  end

  # Yields all scopes starting at self up to the topmost one.
  def up!(&block)
    yield(self)
    @parent.up!(&block) if @parent
  end
end
|
@@ -0,0 +1,87 @@
|
|
1
|
+
# This file is part of the rusty ruby gem.
|
2
|
+
#
|
3
|
+
# Copyright (c) 2013 @radiospiel
|
4
|
+
# Distributed under the terms of the modified BSD license, see LICENSE.BSD
|
5
|
+
|
6
|
+
# -- selector engines ---------------------------------------------------------
|
7
|
+
|
8
|
+
# A selector is an object which is created based on a selector string, and
|
9
|
+
# which implements the `weight` and `match?` methods.
|
10
|
+
module Rusty::Selector
  # A simple, nokogiri based CSS matcher.
  class CSS
    attr :matcher, :name

    # Creates a selector from a CSS selector string, which is also
    # available as #name.
    def initialize(selector)
      @selector = selector
      @name = selector

      # == special case: "*"
      #
      # The "*" selector matches all nodes, and Nokogiri::XML::Node#css
      # returns a huge array of nodes, which match? would have to walk
      # through. Short-circuiting this case saves time and memory.
      #
      # Note: the method is defined directly on this instance, so it also
      # overrides match? methods defined in subclasses.
      return unless @selector == "*"

      define_singleton_method(:match?) { |node| !node.nil? }
    end

    # The weight of the selector; is close to, but not exactly as
    # CSS's weight definition. Each whitespace separated part of the
    # selector contributes according to the first pattern it matches.
    def weight
      @weight ||= @selector.split(/\s+/).inject(0) do |total, part|
        total + case part
                when /#/          then 1_000_000 # part with an ID, much much weight
                when /\./         then 1_000     # selector with a class
                when /^[a-zA-Z_]/ then 1_000     # node name
                else 1
                end
      end
    end

    # Does this selector match a specific node? Evaluates the selector
    # against the node's document on every call.
    def match?(node)
      return false unless node

      node.document.css(@selector).include?(node)
    end
  end

  # A cached CSS selector, caches matching nodes within a document.
  class CachedCSS < CSS
    # Does this selector match a specific node? The set of matching
    # nodes is computed once per document and reused afterwards.
    def match?(node)
      return false unless node

      cache_document(node.document)
      @matching_nodes.include?(node)
    end

    private

    # Evaluates the selector against +document+ and remembers the result,
    # unless +document+ is the one already cached.
    def cache_document(document)
      return if @cached_document && @cached_document == document

      @cached_document = document
      @matching_nodes = document.css(@selector)
    end
  end

  # You probably want cached selectors, especially when working with
  # larger documents. If these eat too much memory, try to use
  #
  #   DEFAULT_SELECTOR = Rusty::Selector::CSS
  #
  # but expect exploding runtimes: this increases O(m+n) -> O(m*n).
  DEFAULT_SELECTOR = Rusty::Selector::CachedCSS

  # Creates a Selector object (an instance of DEFAULT_SELECTOR) for a
  # given `selector` string.
  def self.new(selector)
    DEFAULT_SELECTOR.new(selector)
  end
end
|
data/rusty.gemspec
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
# This file is part of the rusty ruby gem.
|
2
|
+
#
|
3
|
+
# Copyright (c) 2013 @radiospiel
|
4
|
+
# Distributed under the terms of the modified BSD license, see LICENSE.BSD
|
5
|
+
require "#{File.dirname(__FILE__)}/lib/rusty/version.rb"
|
6
|
+
|
7
|
+
# Gem specification for rusty. The version comes from Rusty::VERSION.
Gem::Specification.new do |gem|
  gem.name     = "rusty"
  gem.version  = Rusty::VERSION

  gem.authors  = ["radiospiel"]
  gem.email    = ["eno@radiospiel.org"]
  gem.homepage = "http://github.com/radiospiel/rusty"
  gem.summary  = "XML parsing without the hassle."

  gem.description = gem.summary

  gem.add_dependency "nokogiri"

  # List files NUL-separated: splitting on $\ (nil by default) would
  # split on any whitespace and break file names containing spaces.
  gem.files         = `git ls-files -z`.split("\x0")
  gem.executables   = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]
end
|
data/tasks/rdoc.rake
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
# This file is part of the rusty ruby gem.
|
2
|
+
#
|
3
|
+
# Copyright (c) 2013 @radiospiel
|
4
|
+
# Distributed under the terms of the modified BSD license, see LICENSE.BSD
|
5
|
+
|
6
|
+
# Put the project's lib directory on the load path. This file lives in
# tasks/, so lib is two path segments up from __FILE__ ("../lib" would
# resolve to tasks/lib).
$:.unshift File.expand_path("../../lib", __FILE__)

require 'rdoc/task'

# Defines the rdoc tasks: documentation for the README and all library
# files is written to the rdoc/ directory, titled with the gem version.
RDoc::Task.new do |rdoc|
  require_relative "../lib/rusty/version"
  version = Rusty::VERSION

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "rusty #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
|