rusty 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,40 @@
1
+ # This file is part of the rusty ruby gem.
2
+ #
3
+ # Copyright (c) 2013 @radiospiel
4
+ # Distributed under the terms of the modified BSD license, see LICENSE.BSD
5
+
6
# Helper registry for Rusty rule sets.
#
# Keeps a list of helper modules in the @helpers instance variable of
# whatever object these methods are called on.
module Rusty::Helpers
  # Returns the Array of helper modules registered so far (created empty on
  # first access).
  def helpers
    @helpers ||= []
  end

  # Registers helper modules. Example:
  #
  #   module MyParser
  #     extend Rusty::RuleSet
  #
  #     helper Rusty::Helpers::Text
  #
  #     helper do
  #       def foo
  #         "bar"
  #       end
  #     end
  #   end
  #
  # mods  - any number of modules, appended to the registry in order.
  # block - optional; its body is evaluated inside a fresh anonymous Module,
  #         which is then appended after +mods+.
  #
  # Returns the helper list when a block was given, nil otherwise.
  def helper(*mods, &block)
    helpers.concat(mods)
    return unless block

    anonymous = Module.new
    anonymous.class_eval(&block)
    helpers << anonymous
  end
end
32
+
33
# Text-related helpers.
module Rusty::Helpers::Text
  # Returns a cleaned-up copy of a node's text: every U+200E character is
  # removed, then leading and trailing whitespace is stripped.
  #
  # node - any object responding to #text.
  def text(node)
    raw = node.text
    without_marks = raw.gsub(/\u200e/, "")
    without_marks.strip
  end
end
@@ -0,0 +1,101 @@
1
+ # This file is part of the rusty ruby gem.
2
+ #
3
+ # Copyright (c) 2013 @radiospiel
4
+ # Distributed under the terms of the modified BSD license, see LICENSE.BSD
5
+
6
+ # Nokogiri extensions as used by rusty
7
+
8
+ # ---------------------------------------------------------------------
9
+
10
class Nokogiri::XML::Node
  # Returns this node's attributes as a Hash mapping each attribute name to
  # its value.
  def attributes_hash
    attributes.each_with_object({}) do |(name, attr), hash|
      hash[name] = attr.value
    end
  end

  # Returns the entries of the "class" attribute as an Array of Strings;
  # an empty Array when the node has no "class" attribute.
  def classes
    class_attr = self["class"]
    return [] unless class_attr

    class_attr.strip.split(/\s+/)
  end

  # Does this node have a class with the given name (String or Symbol)?
  #
  # The class list is converted to Symbols once and memoized in @class_syms;
  # it is not refreshed if the "class" attribute is changed afterwards.
  def has_class?(name)
    @class_syms ||= classes.map(&:to_sym)
    @class_syms.include?(name.to_sym)
  end

  # Returns this node's ancestors, outermost first, excluding the document
  # node. Returns an empty Array when the node has no parent or its parent is
  # the document.
  def parents
    return [] if parent.nil? || parent == document

    self_and_parents(parent)
  end

  # Returns +node+ preceded by all of its ancestors (outermost first), up to
  # and excluding the document node.
  #
  # The walk also stops when an ancestor has no parent, so a chain that ends
  # in nil instead of the document no longer raises NoMethodError.
  def self_and_parents(node = self)
    chain = [node]
    while (up = node.parent) && up != node.document
      chain.unshift(up)
      node = up
    end
    chain
  end

  # Returns the debug name for this node, a simplified CSS-like name of the
  # form name{#id}{.class}{.class}. A leading "div" is dropped when it is
  # directly followed by an id or class part.
  def simplified_name
    label = name.dup
    node_id = self["id"]
    label << "##{node_id}" if node_id
    classes.each { |klass| label << ".#{klass}" }
    # \A anchors at the start of the string only; ^ would also match after a
    # newline embedded in an id.
    label.gsub(/\Adiv([\.\#])/, "\\1")
  end
end
59
+
60
class Nokogiri::HTML::Document
  # Returns the encoding declared in the document's <meta> tags, or nil when
  # none is declared.
  #
  # Two declarations are recognised, checked in this order:
  #
  # * HTML5: <meta charset="...">
  # * HTML4: <meta http-equiv="Content-Type" content="...; charset=...">
  #
  # The http-equiv value is compared case-insensitively, and the charset
  # parameter is located anywhere in the content attribute, with or without
  # whitespace around ";" and "=".
  def meta_encoding
    # HTML5
    css("meta[charset]").each do |meta|
      charset = meta["charset"]
      return charset if charset
    end

    # HTML4
    css("meta[http-equiv]").each do |meta|
      next unless meta["http-equiv"].to_s.strip.downcase == "content-type"

      charset = meta["content"].to_s[/charset\s*=\s*["']?([^\s;"']+)/i, 1]
      return charset if charset
    end

    nil
  end
end
82
+
83
module Nokogiri::HTML
  # Parses +data+ as HTML. If the document declares an encoding in its <meta>
  # tags (see Document#meta_encoding) that differs from the encoding Nokogiri
  # reported for the first parse, +data+ is parsed again with the declared
  # encoding.
  #
  # Encoding names are compared case-insensitively, so "utf-8" and "UTF-8"
  # are treated as the same encoding.
  #
  # Returns the re-parsed document when it reports the declared encoding,
  # otherwise the document from the first parse.
  #
  # NOTE(review): +data+ is handed to the parser twice; presumably callers
  # pass a String rather than a one-shot IO -- confirm.
  def self.with_meta_encoding(data)
    doc = Nokogiri.HTML(data)

    declared = doc.meta_encoding
    return doc unless declared

    wanted = declared.downcase
    return doc if doc.encoding.to_s.downcase == wanted

    # try to reread with the declared encoding
    reparsed = Nokogiri.HTML(data, nil, declared)
    return reparsed if reparsed.encoding.to_s.downcase == wanted

    # rereading failed, return original document
    doc
  end
end
101
+
@@ -0,0 +1,124 @@
1
+ # This file is part of the rusty ruby gem.
2
+ #
3
+ # Copyright (c) 2013 @radiospiel
4
+ # Distributed under the terms of the modified BSD license, see LICENSE.BSD
5
+
6
# Rule registration and node transformation.
#
# Rules are stored per mode (:on or :after) in the @rules instance variable of
# the object these methods are called on.
module Rusty::RuleSet
  include Rusty::Helpers

  # A rule combines a selector object with the proc to run for matching nodes.
  class Rule < Struct.new(:selector, :proc); end

  # Records a rule for each of the given selectors in :on mode.
  #
  # transform! evaluates the best matching :on rule before it descends into a
  # node's children.
  def on(*selectors, &block)
    register_rule(:on, *selectors, &block)
  end

  # Records a rule for each of the given selectors in :after mode.
  #
  # transform! evaluates the best matching :after rule once a node's children
  # have been processed. This is similar to
  #
  #   on "selector" do
  #     callback do
  #       do_something
  #     end
  #   end
  def after(*selectors, &block)
    register_rule(:after, *selectors, &block)
  end

  # Returns the matching rule with the highest selector weight for +node+ in
  # +mode+ (:on or :after), or nil when no rule matches.
  def best_rule(mode, node)
    candidates = rules_for_mode(mode).values.select do |rule|
      rule.selector.match?(node)
    end

    candidates.sort_by { |rule| rule.selector.weight }.last
  end

  # Transforms +node+ and returns the scope the transformation ran in.
  #
  # node  - the node to transform; a Nokogiri::XML::Document is replaced by
  #         its root node.
  # scope - the scope to use; a new Rusty::Scope for +node+ when omitted.
  #
  # For each of the modes :on and :after, the best matching rule (if any) is
  # evaluated in a callback binding, followed by the binding's callback (if
  # any). In :on mode, unless the binding reports skip?, all children that are
  # not text, CDATA or comment nodes are transformed recursively in child
  # scopes before the callback runs. If no rule matched in either mode, a
  # message naming the node's path is written to STDERR.
  def transform!(node, scope = nil)
    node = node.root if node.is_a?(Nokogiri::XML::Document)
    scope ||= Rusty::Scope.new(node)

    # The callback scope for this node.
    callback_binding = callback_binding_klass.new(scope)
    matched_any = false

    [ :on, :after ].each do |mode|
      rule = best_rule(mode, node)
      if rule
        matched_any = true
        callback_binding.instance_eval(&rule.proc)
      end

      # in :on mode: process children, unless explicitly skipped.
      if mode == :on && !callback_binding.skip?
        node.children.each do |child|
          next if child.text? || child.cdata? || child.comment?

          transform! child, Rusty::Scope.new(child, scope)
        end
      end

      # run callback
      callback = callback_binding.callback
      callback_binding.instance_eval(&callback) if callback
    end

    unless matched_any
      path = node.self_and_parents.map(&:simplified_name).join(" > ")
      STDERR.puts "no rule registered: #{path}"
    end

    scope
  end

  private

  # Returns the Hash of rules (selector string => Rule) for the given mode,
  # creating it on first use.
  def rules_for_mode(mode)
    (@rules ||= {})[mode] ||= {}
  end

  # Registers +block+ as the rule for every selector in +selectors+ in the
  # given mode (:on or :after). Comma-separated selector strings are split
  # into individual selectors first. Overwriting an existing rule writes a
  # notice to STDERR.
  def register_rule(mode, *selectors, &block)
    registry = rules_for_mode(mode)
    individual = selectors.flat_map { |list| list.split(",").map(&:strip) }

    individual.each do |selector|
      if registry[selector]
        STDERR.puts "#{name}, in mode :#{mode}: redefining rule for #{selector}"
      end
      registry[selector] = Rule.new(Rusty::Selector.new(selector), block)
    end
  end

  # Returns the class used for callback bindings in this rule set, memoized.
  # It is built by Rusty::CallbackBinding.subclass_with_name_and_helpers from
  # this module's name and its registered helpers.
  def callback_binding_klass
    @callback_binding_klass ||= Rusty::CallbackBinding.subclass_with_name_and_helpers name, *helpers
  end
end
@@ -0,0 +1,29 @@
1
+ # This file is part of the rusty ruby gem.
2
+ #
3
+ # Copyright (c) 2013 @radiospiel
4
+ # Distributed under the terms of the modified BSD license, see LICENSE.BSD
5
+
6
+ #
7
# A Rusty output scope. It is related to an input node and may or may not
# have a parent scope.
class Rusty::Scope < Rusty::DX
  # Public reader, private writer for the input node.
  # (attr_accessor replaces the deprecated `attr :node, true` form.)
  attr_accessor :node
  private :node=

  # node   - the input node this scope belongs to.
  # parent - the enclosing scope, or nil for a top-level scope.
  def initialize(node, parent = nil)
    @node = node
    @parent = parent
  end

  # Does this scope match the given name?
  #
  # The special name "document" matches exactly the scopes without a parent.
  # Any other name matches when it equals the node's name or the node has a
  # class of that name.
  def has_name?(name)
    return @parent.nil? if name == "document"

    node.name == name || node.has_class?(name)
  end

  # Yields this scope, then each parent scope in turn, up to the top.
  def up!(&block)
    yield(self)
    @parent.up!(&block) if @parent
  end
end
@@ -0,0 +1,87 @@
1
+ # This file is part of the rusty ruby gem.
2
+ #
3
+ # Copyright (c) 2013 @radiospiel
4
+ # Distributed under the terms of the modified BSD license, see LICENSE.BSD
5
+
6
+ # -- selector engines ---------------------------------------------------------
7
+
8
# A selector is an object which is created from a selector string and which
# implements the `weight` and `match?` methods.
module Rusty::Selector
  # A simple, Nokogiri-based CSS matcher.
  class CSS
    attr_reader :matcher, :name

    # Creates a selector from a CSS selector string.
    def initialize(selector)
      @selector = selector
      @name = selector

      # == special case: "*"
      #
      # The "*" selector matches every node, so the generic match? would have
      # to build and scan the node set of the whole document. For "*" the
      # instance gets its own match? that only checks the node is not nil.
      #
      # Note: because it is defined on the instance itself, this special case
      # also overrides match? methods defined in subclasses.
      if @selector == "*"
        define_singleton_method(:match?) { |node| !node.nil? }
      end
    end

    # The weight of the selector; close to, but not exactly, CSS's weight
    # definition. Each whitespace-separated part contributes:
    #
    # * 1_000_000 when it contains "#" (an ID),
    # * 1_000 when it contains "." (a class),
    # * 1_000 when it starts with a letter or underscore (a node name),
    # * 1 otherwise.
    #
    # The sum is memoized.
    def weight
      @weight ||= @selector.split(/\s+/).map { |part|
        case part
        when /#/          then 1_000_000
        when /\./         then 1_000
        when /^[a-zA-Z_]/ then 1_000
        else                   1
        end
      }.inject(0, :+)
    end

    # Does this selector match the given node? Runs the CSS query against the
    # node's document and checks whether the node is among the results.
    def match?(node)
      node ? node.document.css(@selector).include?(node) : false
    end
  end

  # A cached CSS selector: remembers the nodes matching in the most recently
  # seen document.
  class CachedCSS < CSS
    # Does this selector match the given node?
    def match?(node)
      return false unless node

      cache_document(node.document)
      @matching_nodes.include?(node)
    end

    private

    # Runs the CSS query against +document+ and stores the result, unless the
    # stored result already belongs to that document.
    def cache_document(document)
      unless @cached_document && @cached_document == document
        @cached_document = document
        @matching_nodes = document.css(@selector)
      end
    end
  end

  # You probably want cached selectors, especially when working with larger
  # documents. If these eat too much memory, try
  #
  #   DEFAULT_SELECTOR = Rusty::Selector::CSS
  #
  # but expect much longer runtimes: the uncached variant re-runs the CSS
  # query on every match? call.
  DEFAULT_SELECTOR = CachedCSS

  # Creates a selector object for the given `selector` string, using
  # DEFAULT_SELECTOR.
  def self.new(selector)
    DEFAULT_SELECTOR.new(selector)
  end
end
@@ -0,0 +1,8 @@
1
+ # This file is part of the rusty ruby gem.
2
+ #
3
+ # Copyright (c) 2013 @radiospiel
4
+ # Distributed under the terms of the modified BSD license, see LICENSE.BSD
5
+
6
module Rusty
  # The gem's version string. Frozen so it cannot be mutated in place.
  VERSION = "0.1".freeze
end
@@ -0,0 +1,24 @@
1
+ # This file is part of the rusty ruby gem.
2
+ #
3
+ # Copyright (c) 2013 @radiospiel
4
+ # Distributed under the terms of the modified BSD license, see LICENSE.BSD
5
# Load the version relative to this file, independent of the working directory.
require File.expand_path("../lib/rusty/version", __FILE__)

# Gem specification for rusty.
Gem::Specification.new do |gem|
  gem.name          = "rusty"
  gem.version       = Rusty::VERSION

  gem.authors       = ["radiospiel"]
  gem.email         = ["eno@radiospiel.org"]
  gem.homepage      = "http://github.com/radiospiel/rusty"
  gem.summary       = "XML parsing without the hassle."

  gem.description   = gem.summary

  gem.add_dependency "nokogiri"

  # List tracked files NUL-delimited. Splitting on $\ (nil by default) splits
  # on any whitespace and breaks file names that contain spaces.
  gem.files         = `git ls-files -z`.split("\x0")
  gem.executables   = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]
end
@@ -0,0 +1,18 @@
1
+ # This file is part of the rusty ruby gem.
2
+ #
3
+ # Copyright (c) 2013 @radiospiel
4
+ # Distributed under the terms of the modified BSD license, see LICENSE.BSD
5
+
6
# Put a "lib" directory on the load path.
# NOTE(review): this resolves "../lib" against this file's own path, while the
# require_relative below resolves "../lib" against this file's directory, so
# the two point at different directories -- confirm which is intended.
$LOAD_PATH.unshift File.expand_path("../lib", __FILE__)

require 'rdoc/task'

# Defines the rdoc task: generates documentation for the README and all Ruby
# files below lib/ into the "rdoc" directory, titled with the gem version.
RDoc::Task.new do |rdoc|
  require_relative "../lib/rusty/version"

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title    = "rusty #{Rusty::VERSION}"
  rdoc.rdoc_files.include('README*', 'lib/**/*.rb')
end
@@ -0,0 +1,8 @@
1
require 'rake/testtask'

# Defines the :test task: runs every test/**/test_*.rb file verbosely, with
# "lib" and "test" added to the load path.
Rake::TestTask.new(:test) do |t|
  t.libs.push('lib', 'test')
  t.pattern = 'test/**/test_*.rb'
  t.verbose = true
end
8
+