rusty 0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,40 @@
1
+ # This file is part of the rusty ruby gem.
2
+ #
3
+ # Copyright (c) 2013 @radiospiel
4
+ # Distributed under the terms of the modified BSD license, see LICENSE.BSD
5
+
6
# Helper registry for Rusty rule sets.
module Rusty::Helpers
  # Returns the list of helper modules registered so far. The list is
  # created (empty) on first access and memoized in @helpers.
  def helpers
    @helpers ||= []
  end

  # Registers helpers. Examples:
  #
  #   module MyParser
  #     extend Rusty::RuleSet
  #
  #     helper Rusty::Helpers::Text
  #
  #     helper do
  #       def foo
  #         "bar"
  #       end
  #     end
  #   end
  #
  # Every module passed in +mods+ is appended to #helpers. When a block is
  # given, it is evaluated as the body of a fresh anonymous module, which is
  # appended as well.
  #
  # Returns the helpers list when a block was given, nil otherwise.
  def helper(*mods, &block)
    helpers.concat(mods)
    return unless block

    helpers << Module.new(&block)
  end
end
32
+
33
# Text related helpers.
module Rusty::Helpers::Text
  # Returns a cleaned version of a node's text: every U+200E
  # (LEFT-TO-RIGHT MARK) character is removed, then leading and trailing
  # whitespace is stripped.
  def text(node)
    raw = node.text
    raw.gsub(/\u200e/, "").strip
  end
end
@@ -0,0 +1,101 @@
1
+ # This file is part of the rusty ruby gem.
2
+ #
3
+ # Copyright (c) 2013 @radiospiel
4
+ # Distributed under the terms of the modified BSD license, see LICENSE.BSD
5
+
6
+ # Nokogiri extensions as used by rusty
7
+
8
+ # ---------------------------------------------------------------------
9
+
10
# Convenience extensions to Nokogiri::XML::Node as used by rusty.
class Nokogiri::XML::Node
  # Returns the node's attributes as a Hash of attribute name => value.
  def attributes_hash
    attributes.each_with_object({}) do |(name, attr), hash|
      hash[name] = attr.value
    end
  end

  # Returns the entries of the "class" attribute as an Array of Strings;
  # an empty Array when the node has no "class" attribute.
  def classes
    class_attr = self["class"]
    return [] unless class_attr

    class_attr.strip.split(/\s+/)
  end

  # Does this node have a class with the given name?
  #
  # The class list is converted to Symbols on the first call and memoized
  # in @class_syms, so later changes to the "class" attribute are not
  # picked up by this method.
  def has_class?(name)
    @class_syms ||= classes.map(&:to_sym)
    @class_syms.include?(name.to_sym)
  end

  # Returns the ancestors of this node, outermost first, up until and
  # excluding the document node. Returns an empty Array when the node has
  # no parent or when its parent is the document.
  def parents
    ancestor = parent
    return [] if ancestor.nil? || ancestor == document

    self_and_parents(ancestor)
  end

  # Returns +node+ preceded by all of its ancestors (outermost first), up
  # until and excluding the document node.
  #
  # The walk also stops when a node has no parent at all, so nodes that are
  # not attached below the document no longer raise NoMethodError on nil.
  def self_and_parents(node = self)
    chain = [node]
    while (up = node.parent) && up != node.document
      chain.unshift(up)
      node = up
    end
    chain
  end

  # Returns the debug name for this node, a simplified CSS-like name of the
  # form
  #
  #   name{#id}{.class}{.class}
  #
  # A leading "div" is dropped when an id or class follows it.
  def simplified_name
    label = name.dup
    id = self["id"]
    label << "##{id}" if id
    classes.each { |klass| label << ".#{klass}" }

    # \A anchors at the very start of the string; only the leading node
    # name may be abbreviated, never text after a newline inside an id.
    label.sub(/\Adiv([.#])/, '\1')
  end
end
59
+
60
class Nokogiri::HTML::Document
  # Returns the encoding declared in the document's <meta> elements, or nil
  # when none is declared. Available only in HTML documents.
  #
  # Two declaration styles are recognized, in this order:
  #
  #   <meta charset="utf-8">                                   (HTML5)
  #   <meta http-equiv="Content-Type"
  #         content="text/html; charset=utf-8">                (HTML4)
  def meta_encoding
    # HTML5: first <meta charset> carrying a non-blank value.
    css("meta[charset]").each do |meta|
      charset = meta["charset"].to_s.strip
      return charset unless charset.empty?
    end

    # HTML4: the http-equiv value is compared case-insensitively in Ruby,
    # because pages commonly spell it "Content-Type" and an exact-match
    # selector on the lowercase value would skip those elements.
    css("meta[http-equiv]").each do |meta|
      next unless meta["http-equiv"].to_s.strip.casecmp("content-type") == 0

      # Accept "charset=x" anywhere in the content value, with or without
      # whitespace after the ";", optionally quoted, in any letter case.
      charset = meta["content"].to_s[/charset\s*=\s*["']?([^\s;"']+)/i, 1]
      return charset if charset
    end

    nil
  end
end
82
+
83
module Nokogiri::HTML
  # Loads a document from +data+. If the encoding Nokogiri determined for
  # the document does not match the document's meta_encoding, the data is
  # parsed a second time with that encoding.
  #
  # Returns the re-parsed document when it reports the requested encoding,
  # otherwise the document from the first parse.
  def self.with_meta_encoding(data)
    # Encoding names are compared case-insensitively, so that e.g. "UTF-8"
    # and "utf-8" do not trigger a pointless second parse. to_s covers a
    # nil document encoding.
    same_encoding = lambda do |left, right|
      left.to_s.downcase == right.to_s.downcase
    end

    doc = Nokogiri.HTML(data)

    meta_encoding = doc.meta_encoding
    return doc unless meta_encoding
    return doc if same_encoding.call(doc.encoding, meta_encoding)

    # try to reread with meta_encoding
    reparsed = Nokogiri.HTML(data, nil, meta_encoding)
    return reparsed if same_encoding.call(reparsed.encoding, meta_encoding)

    # rereading failed, return original document
    doc
  end
end
101
+
@@ -0,0 +1,124 @@
1
+ # This file is part of the rusty ruby gem.
2
+ #
3
+ # Copyright (c) 2013 @radiospiel
4
+ # Distributed under the terms of the modified BSD license, see LICENSE.BSD
5
+
6
# A RuleSet records rules (selector + proc) in two modes, :on and :after,
# and applies them while walking a node tree in #transform!.
module Rusty::RuleSet
  include Rusty::Helpers

  # A rule combines a selector with a proc.
  class Rule < Struct.new(:selector, :proc)
  end

  # Records a rule for any of these selectors.
  #
  # This rule will be activated when a node gets processed.
  def on(*selectors, &block)
    register_rule(:on, *selectors, &block)
  end

  # Records a rule for any of these selectors.
  #
  # This rule will be activated when a node's processing is done.
  #
  # Note: The after method is similar to
  #
  #   on "selector" do
  #     callback do
  #       do_something
  #     end
  #   end
  def after(*selectors, &block)
    register_rule(:after, *selectors, &block)
  end

  # Returns the best matching rule for a given node, or nil when no rule
  # registered in this mode matches. Among the matching rules the one
  # sorted last by selector weight is chosen.
  #
  # Mode should be :on or :after.
  def best_rule(mode, node)
    candidates = rules_for_mode(mode).values.select do |rule|
      rule.selector.match?(node)
    end

    candidates.sort_by { |rule| rule.selector.weight }.last
  end

  # Transforms a node and returns the scope holding the transformed data.
  #
  # When +node+ is a Nokogiri::XML::Document, its root node is transformed
  # instead. When no +scope+ is passed in, a new Rusty::Scope for the node
  # is created.
  def transform!(node, scope = nil)
    node = node.root if node.is_a?(Nokogiri::XML::Document)
    scope ||= Rusty::Scope.new(node)

    # The callback scope for this node.
    callback_binding = callback_binding_klass.new(scope)

    has_rule = false

    [ :on, :after ].each do |mode|
      # Run the explicit rule for this node in this mode, if there is one.
      rule = best_rule(mode, node)
      if rule
        has_rule = true
        callback_binding.instance_eval(&rule.proc)
      end

      # In :on mode: process children, unless explicitly skipped. Text,
      # CDATA and comment children are never transformed.
      if mode == :on && !callback_binding.skip?
        node.children.each do |child|
          next if child.text? || child.cdata? || child.comment?

          transform!(child, Rusty::Scope.new(child, scope))
        end
      end

      # Run the callback held by the binding, if any. This happens at the
      # end of each mode.
      callback = callback_binding.callback
      callback_binding.instance_eval(&callback) if callback
    end

    # Warn when neither mode had a rule for this node.
    unless has_rule
      path = node.self_and_parents.map(&:simplified_name).join(" > ")
      STDERR.puts "no rule registered: #{path}"
    end

    scope
  end

  private

  # Returns the hash of rules (selector string => Rule) for a given mode,
  # creating it on first access.
  def rules_for_mode(mode)
    @rules ||= {}
    @rules[mode] ||= {}
  end

  # Registers a rule for a number of selectors in a given mode.
  # Mode should be :on or :after.
  #
  # Each entry of +selectors+ may hold several comma separated selectors;
  # they are split and stripped, and the block is registered for every one
  # of them. Redefining an existing rule prints a notice to STDERR.
  def register_rule(mode, *selectors, &block)
    rules = rules_for_mode(mode)

    selectors.flat_map { |list| list.split(",").map(&:strip) }.each do |selector|
      if rules[selector]
        STDERR.puts "#{name}, in mode :#{mode}: redefining rule for #{selector}"
      end

      rules[selector] = Rule.new(Rusty::Selector.new(selector), block)
    end
  end

  # Returns the class for event scopes in this RuleSet, built once via
  # Rusty::CallbackBinding.subclass_with_name_and_helpers from this
  # module's name and its helpers.
  #
  # According to the original notes this is a subclass of
  # Rusty::CallbackBinding named after the current module (i.e. if RuleSet
  # is extended into a module Foo, the subclass will be named
  # Rusty::CallbackBinding::Foo) with all helpers loaded.
  def callback_binding_klass
    @callback_binding_klass ||=
      Rusty::CallbackBinding.subclass_with_name_and_helpers(name, *helpers)
  end
end
@@ -0,0 +1,29 @@
1
+ # This file is part of the rusty ruby gem.
2
+ #
3
+ # Copyright (c) 2013 @radiospiel
4
+ # Distributed under the terms of the modified BSD license, see LICENSE.BSD
5
+
6
+ #
7
# A Rusty output scope. It is related to an input node, and might or might
# not have a parent scope.
class Rusty::Scope < Rusty::DX
  # The input node. The reader is public, the writer is private.
  # (attr_accessor replaces the obsolete `attr :node, true` form, which
  # triggers an "optional boolean argument is obsoleted" warning.)
  attr_accessor :node
  private :node=

  # Creates a scope for +node+, optionally nested below a +parent+ scope.
  def initialize(node, parent = nil)
    @node = node
    @parent = parent
  end

  # Does this scope match a given name?
  #
  # The name "document" matches exactly the scopes without a parent. Any
  # other name matches when it equals the node's name or is one of the
  # node's classes.
  def has_name?(name)
    return @parent.nil? if name == "document"

    node.name == name || node.has_class?(name)
  end

  # Yields all scopes, starting at self and going up to the top.
  def up!(&block)
    yield(self)
    @parent.up!(&block) if @parent
  end
end
@@ -0,0 +1,87 @@
1
+ # This file is part of the rusty ruby gem.
2
+ #
3
+ # Copyright (c) 2013 @radiospiel
4
+ # Distributed under the terms of the modified BSD license, see LICENSE.BSD
5
+
6
+ # -- selector engines ---------------------------------------------------------
7
+
8
# A selector is an object which is created from a selector string, and
# which implements the `weight` and `match?` methods.
module Rusty::Selector
  # A simple, nokogiri based CSS matcher.
  class CSS
    attr :matcher, :name

    # Creates a selector for the given selector string, which also serves
    # as the selector's name.
    def initialize(selector)
      @selector = selector
      @name = selector

      # == special case: "*"
      #
      # The "*" selector matches all nodes, and Nokogiri::XML::Node#css
      # returns a huge array of nodes, which match? would have to walk
      # through. The original author measured a ~10% speedup in the google
      # example, and a reduced memory load, from this special case.
      #
      # Note: by defining it directly on <self> this special case
      # implementation also overrides match? methods defined in subclasses.
      return unless @selector == "*"

      def self.match?(node)
        !node.nil?
      end
    end

    # The weight of the selector; close to, but not exactly the same as,
    # CSS's weight definition. The selector is split on whitespace and the
    # weights of its parts are added up; the result is memoized.
    def weight
      @weight ||= @selector.split(/\s+/).inject(0) do |total, part|
        part_weight =
          if part =~ /#/
            1_000_000 # part with an ID, much much weight
          elsif part =~ /\./
            1_000 # selector with a class
          elsif part =~ /^[a-zA-Z_]/
            1_000 # node name
          else
            1
          end

        total + part_weight
      end
    end

    # Does this selector match a specific node? Runs the selector against
    # the node's document and checks whether the node is in the result.
    def match?(node)
      node ? node.document.css(@selector).include?(node) : false
    end
  end

  # A cached CSS selector; caches the matching nodes within a document.
  class CachedCSS < CSS
    # Does this selector match a specific node?
    def match?(node)
      if node
        cache_document(node.document)
        @matching_nodes.include?(node)
      else
        false
      end
    end

    private

    # Remembers +document+ and the nodes matching the selector in it. Does
    # nothing when that same document is already cached.
    def cache_document(document)
      unless @cached_document && @cached_document == document
        @cached_document = document
        @matching_nodes = document.css(@selector)
      end
    end
  end

  # You probably want cached selectors, especially when working with
  # larger documents. If these eat too much memory, try to use
  #
  #   DEFAULT_SELECTOR = Rusty::Selector::CSS
  #
  # but, per the original author's note, expect exploding runtimes: this
  # increases O(m+n) -> O(m*n).
  DEFAULT_SELECTOR = CachedCSS

  # Creates a Selector object for a given `selector` string.
  def self.new(selector)
    DEFAULT_SELECTOR.new(selector)
  end
end
@@ -0,0 +1,8 @@
1
+ # This file is part of the rusty ruby gem.
2
+ #
3
+ # Copyright (c) 2013 @radiospiel
4
+ # Distributed under the terms of the modified BSD license, see LICENSE.BSD
5
+
6
module Rusty
  # The gem's version string. Frozen, so that the shared constant cannot be
  # modified in place.
  VERSION = "0.1".freeze
end
@@ -0,0 +1,24 @@
1
+ # This file is part of the rusty ruby gem.
2
+ #
3
+ # Copyright (c) 2013 @radiospiel
4
+ # Distributed under the terms of the modified BSD license, see LICENSE.BSD
5
+ require "#{File.dirname(__FILE__)}/lib/rusty/version.rb"
6
+
7
# Gem specification for the rusty gem.
Gem::Specification.new do |gem|
  gem.name     = "rusty"
  gem.version  = Rusty::VERSION

  gem.authors  = ["radiospiel"]
  gem.email    = ["eno@radiospiel.org"]
  gem.homepage = "http://github.com/radiospiel/rusty"
  gem.summary  = "XML parsing without the hassle."

  gem.description = gem.summary

  gem.add_dependency "nokogiri"

  # List the tracked files NUL-separated. Splitting on $\ (nil by default)
  # would split on any whitespace and break file names containing spaces.
  gem.files         = `git ls-files -z`.split("\x0")
  gem.executables   = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]
end
@@ -0,0 +1,18 @@
1
+ # This file is part of the rusty ruby gem.
2
+ #
3
+ # Copyright (c) 2013 @radiospiel
4
+ # Distributed under the terms of the modified BSD license, see LICENSE.BSD
5
+
6
+ $:.unshift File.expand_path("../lib", __FILE__)
7
+
8
+ require 'rdoc/task'
9
+
10
# Defines the rdoc task: documentation for the README and all library
# files is generated into the 'rdoc' directory.
RDoc::Task.new do |rdoc|
  # The version file is loaded inside the configuration block; it is only
  # needed for the title.
  require_relative "../lib/rusty/version"
  rusty_version = Rusty::VERSION

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title    = "rusty #{rusty_version}"
  rdoc.rdoc_files.include('README*', 'lib/**/*.rb')
end
@@ -0,0 +1,8 @@
1
+ require 'rake/testtask'
2
+
3
# Defines the :test task: runs every test/**/test_*.rb file with 'lib' and
# 'test' added to the load path, in verbose mode.
Rake::TestTask.new(:test) do |task|
  task.libs.concat(%w[lib test])
  task.pattern = 'test/**/test_*.rb'
  task.verbose = true
end
8
+