cocoa-xml 0.4.4
Sign up to get free protection for your applications and to get access to all the features.
- data/History.rdoc +6 -0
- data/Manifest.txt +20 -0
- data/README.rdoc +105 -0
- data/Rakefile +23 -0
- data/lib/cocoa-xml.rb +48 -0
- data/lib/cocoa-xml/nodeset.rb +57 -0
- data/lib/cocoa-xml/nsxmldocument_extras.rb +48 -0
- data/lib/cocoa-xml/nsxmlnode_extras.rb +106 -0
- data/lib/cocoa-xml/version.rb +3 -0
- data/lib/nokogiri/css.rb +27 -0
- data/lib/nokogiri/css/generated_parser.rb +646 -0
- data/lib/nokogiri/css/generated_tokenizer.rb +143 -0
- data/lib/nokogiri/css/node.rb +100 -0
- data/lib/nokogiri/css/parser.rb +83 -0
- data/lib/nokogiri/css/parser.y +230 -0
- data/lib/nokogiri/css/syntax_error.rb +7 -0
- data/lib/nokogiri/css/tokenizer.rb +8 -0
- data/lib/nokogiri/css/tokenizer.rex +55 -0
- data/lib/nokogiri/css/xpath_visitor.rb +165 -0
- data/lib/nokogiri/syntax_error.rb +4 -0
- metadata +123 -0
data/History.rdoc
ADDED
data/Manifest.txt
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
History.rdoc
|
2
|
+
Manifest.txt
|
3
|
+
README.rdoc
|
4
|
+
Rakefile
|
5
|
+
lib/cocoa-xml.rb
|
6
|
+
lib/cocoa-xml/nodeset.rb
|
7
|
+
lib/cocoa-xml/version.rb
|
8
|
+
lib/cocoa-xml/nsxmlnode_extras.rb
|
9
|
+
lib/cocoa-xml/nsxmldocument_extras.rb
|
10
|
+
lib/nokogiri/css.rb
|
11
|
+
lib/nokogiri/syntax_error.rb
|
12
|
+
lib/nokogiri/css/generated_parser.rb
|
13
|
+
lib/nokogiri/css/generated_tokenizer.rb
|
14
|
+
lib/nokogiri/css/node.rb
|
15
|
+
lib/nokogiri/css/parser.rb
|
16
|
+
lib/nokogiri/css/parser.y
|
17
|
+
lib/nokogiri/css/syntax_error.rb
|
18
|
+
lib/nokogiri/css/tokenizer.rb
|
19
|
+
lib/nokogiri/css/tokenizer.rex
|
20
|
+
lib/nokogiri/css/xpath_visitor.rb
|
data/README.rdoc
ADDED
@@ -0,0 +1,105 @@
|
|
1
|
+
= Cocoa-XML
|
2
|
+
|
3
|
+
* http://github.com/cehoffman/cocoa-xml
|
4
|
+
* http://cehoffman.github.com/cocoa-xml
|
5
|
+
|
6
|
+
== DESCRIPTION:
|
7
|
+
|
8
|
+
Cocoa-xml provides a more Ruby-like interface to Cocoa's NSXMLDocument
|
9
|
+
and classes that inherit from NSXMLNode. It provides access to XPath,
|
10
|
+
XQuery, and CSS selectors for searching documents.
|
11
|
+
|
12
|
+
== FEATURES/PROBLEMS:
|
13
|
+
|
14
|
+
* Cocoa-xml uses native Cocoa XML processing facilities.
|
15
|
+
* CSS, XPath, and XQuery can be used to search a document
|
16
|
+
* Broken HTML documents can be processed thanks to NSXMLDocument
|
17
|
+
|
18
|
+
== SYNOPSIS:
|
19
|
+
|
20
|
+
require 'cocoa-xml'
|
21
|
+
|
22
|
+
doc = CocoaXML::HTML("http://www.google.com/search?q=cehoffman")
|
23
|
+
|
24
|
+
doc.css('h3.r a.l').each do |link|
|
25
|
+
puts link.text
|
26
|
+
end
|
27
|
+
|
28
|
+
doc.xpath('//h3/a[@class="l"]').each do |link|
|
29
|
+
puts link.text
|
30
|
+
end
|
31
|
+
|
32
|
+
doc.xquery('data(//h3/a[@class="l"])').each do |link|
|
33
|
+
puts link
|
34
|
+
end
|
35
|
+
|
36
|
+
== REQUIREMENTS:
|
37
|
+
|
38
|
+
* MacRuby 0.6 (development version)
|
39
|
+
* OS X Snow Leopard (only version supported by MacRuby currently)
|
40
|
+
|
41
|
+
== INSTALL:
|
42
|
+
|
43
|
+
$ sudo gem install cocoa-xml
|
44
|
+
|
45
|
+
== DEVELOPERS:
|
46
|
+
|
47
|
+
It is advised to use the current development version of {MacRuby}[http://www.macruby.com] from
|
48
|
+
the source tree.
|
49
|
+
|
50
|
+
== CREDITS:
|
51
|
+
|
52
|
+
CSS selector support is taken from Nokogiri.
|
53
|
+
|
54
|
+
== LICENSE:
|
55
|
+
|
56
|
+
(The MIT License)
|
57
|
+
|
58
|
+
Copyright (c) 2010 Chris Hoffman
|
59
|
+
|
60
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
61
|
+
a copy of this software and associated documentation files (the
|
62
|
+
'Software'), to deal in the Software without restriction, including
|
63
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
64
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
65
|
+
permit persons to whom the Software is furnished to do so, subject to
|
66
|
+
the following conditions:
|
67
|
+
|
68
|
+
The above copyright notice and this permission notice shall be
|
69
|
+
included in all copies or substantial portions of the Software.
|
70
|
+
|
71
|
+
THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
|
72
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
73
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
74
|
+
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
75
|
+
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
76
|
+
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
77
|
+
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
78
|
+
|
79
|
+
=== Nokogiri License
|
80
|
+
|
81
|
+
(The MIT License)
|
82
|
+
|
83
|
+
Copyright (c) 2008 - 2009:
|
84
|
+
|
85
|
+
* {Aaron Patterson}[http://tenderlovemaking.com]
|
86
|
+
* {Mike Dalessio}[http://mike.daless.io]
|
87
|
+
|
88
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
89
|
+
a copy of this software and associated documentation files (the
|
90
|
+
'Software'), to deal in the Software without restriction, including
|
91
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
92
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
93
|
+
permit persons to whom the Software is furnished to do so, subject to
|
94
|
+
the following conditions:
|
95
|
+
|
96
|
+
The above copyright notice and this permission notice shall be
|
97
|
+
included in all copies or substantial portions of the Software.
|
98
|
+
|
99
|
+
THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
|
100
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
101
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
102
|
+
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
103
|
+
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
104
|
+
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
105
|
+
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/Rakefile
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
# -*- ruby -*-
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
require 'hoe'
|
5
|
+
require "./lib/cocoa-xml/version.rb"
|
6
|
+
|
7
|
+
# Register the Hoe plugins used by this project, one call per plugin and in
# the same order as before.
[:gemcutter, :clean, :git, :yard].each { |plugin| Hoe.plugin plugin }

# Gem specification for cocoa-xml.
Hoe.spec 'cocoa-xml' do
  # The version constant is expected to come from lib/cocoa-xml/version.rb,
  # which is required at the top of this Rakefile.
  self.version = ::CocoaXML::Version
  developer 'Chris Hoffman', 'cehoffman@gmail.com'

  self.rubyforge_name = 'cocoa-xml'

  # YARD documentation settings.
  self.yard_title  = 'Cocoa-XML'
  self.yard_markup = 'rdoc'
  self.yard_opts   = ['--no-private']
end
|
22
|
+
|
23
|
+
# vim: syntax=ruby
|
data/lib/cocoa-xml.rb
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
framework 'Cocoa'
|
2
|
+
|
3
|
+
require "cocoa-xml/version"
|
4
|
+
require "cocoa-xml/nodeset"
|
5
|
+
require "cocoa-xml/nsxmldocument_extras"
|
6
|
+
require "cocoa-xml/nsxmlnode_extras"
|
7
|
+
require "nokogiri/css"
|
8
|
+
|
9
|
+
module CocoaXML
  # Parse an input HTML source
  #
  # @param [url, NSURL, #read, #to_str] source a url as a string or NSURL,
  #   object that responds to #read, or #to_str
  # @return [NSXMLDocument] An NSXMLDocument set to interpret source as HTML
  def self.HTML(source)
    parse source, NSXMLDocumentTidyHTML
  end

  # Parse an input XML source
  #
  # @param [url, NSURL, #read, #to_str] source a url as a string or NSURL,
  #   object that responds to #read, or #to_str
  # @return [NSXMLDocument] An NSXMLDocument set to interpret source as XML
  def self.XML(source)
    parse source, NSXMLDocumentTidyXML
  end

  # NOTE: a bare `private` does not change the visibility of methods defined
  # with `def self.`, so `parse` stays callable from outside exactly as
  # before. The keyword is kept only to signal intent.
  private

  # Parse an HTML or XML source
  #
  # The source is resolved in this order:
  # 1. an NSURL is used directly as the location to load from;
  # 2. anything responding to #to_str is handed to NSURL.URLWithString, and
  #    the resulting URL (when one is produced) is loaded;
  # 3. otherwise the source (after calling #read on it when it responds to
  #    #read) is parsed as a string of markup.
  #
  # @param [url, NSURL, #read, #to_str] source a url as a string or NSURL,
  #   object that responds to #read, or #to_str
  # @param [Number] type constant determining how to interpret the input
  #   source, xml or html
  # @return [NSXMLDocument] whatever the NSXMLDocument initializer returns;
  #   the error pointer handed to Cocoa is currently not inspected
  def self.parse(source, type)
    error = Pointer.new :object

    # Check for NSURL first: it need not respond to #to_str, and previously
    # such a source was never treated as a URL.
    url =
      if source.is_a?(NSURL)
        source
      elsif source.respond_to?(:to_str)
        NSURL.URLWithString(source.to_str)
      end
    source = source.read if source.respond_to?(:read)

    if url
      ::NSXMLDocument.alloc.initWithContentsOfURL url, options: type, error: error
    else
      ::NSXMLDocument.alloc.initWithXMLString source, options: type, error: error
    end
  end
end
|
@@ -0,0 +1,57 @@
|
|
1
|
+
module CocoaXML
  # An Array of nodes (and, for XQuery results, possibly Strings) that
  # forwards searches and attribute lookups to each of its elements.
  class NodeSet < Array
    # Collect all the texts of nodes in set and join together
    #
    # Elements responding to #to_str contribute that string; all other
    # elements contribute the result of their #text method.
    #
    # @return [String] single string containing text of each element
    def text
      collect { |node| (node.respond_to?(:to_str) && node.to_str) || node.text }.flatten.join
    end
    alias :inner_text :text

    # Collect all the #to_s representations of elements in array
    #
    # @return [String] single string containing #to_s of each element
    def to_s
      collect { |node| node.to_s }.join
    end

    # Perform selector on each element in set
    #
    # The selector is translated to XPath with a ".//" prefix. When the
    # translation yields several expressions (presumably one per
    # comma-separated selector) they are combined with the XPath union
    # operator instead of being concatenated into one invalid path.
    #
    # @param [String] selector css selector to use on each node
    # @return [NodeSet<NSXMLNode, String>] new set resulting from
    #   performing selector on each node in set
    # @todo This will bomb if the node has String elements in it
    def css(selector)
      xpath ::Nokogiri::CSS::xpath_for(selector, :prefix => ".//").join(' | ')
    end

    # Perform XQuery on each node in set
    #
    # A leading "//" is removed so the query starts from each node. The
    # caller's string is left untouched (frozen strings are accepted).
    #
    # @param [String] query xquery to perform on each node in set
    # @return [NodeSet<NSXMLNode, String>] results of performing
    #   query
    # @todo This will bomb if the node has String elements in it
    def xquery(query)
      relative = query.sub(%r{\A//}, '') # Root searches to start from nodes
      NodeSet.new(collect { |node| node.xquery(relative) }.flatten)
    end

    # Perform XPath selection on each node in set
    #
    # A leading "//" is removed so the path starts from each node. The
    # caller's string is left untouched (frozen strings are accepted).
    #
    # @param [String] query xpath to follow on each node
    # @return [NodeSet<NSXMLNode>] results of path on each node
    # @todo This will bomb if the node has String elements in it
    def xpath(query)
      relative = query.sub(%r{\A//}, '') # Root searches to start from nodes
      NodeSet.new(collect { |node| node.xpath(relative) }.flatten)
    end

    # Get the value of the attribute for each node in set
    #
    # Integer and Range arguments are treated as ordinary Array indexing so
    # that positional access (set[0], set[1..2]) keeps working on a NodeSet.
    #
    # @param [String, Integer, Range] attr attribute to search for on each
    #   node, or a position / range of positions within the set
    # @return [Array<String, nil>, Object] value of attribute for each node,
    #   or the indexed element(s) for Integer / Range arguments
    def [](attr)
      return super if attr.is_a?(Integer) || attr.is_a?(Range)

      collect { |node| node[attr] }.flatten
    end
  end
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
module CocoaXML
  # Convenience methods mixed into a document class. The including class is
  # expected to provide characterEncoding, setCharacterEncoding and
  # documentContentKind.
  module NSXMLDocumentExtras
    # @private
    #
    # Hook run when the module is included: gives the including class
    # +encoding+ / +encoding=+ as aliases of its own accessor methods.
    def self.included(klass)
      klass.class_eval do
        alias encoding  characterEncoding
        alias encoding= setCharacterEncoding
      end
    end

    # Encoding of document
    #
    # @see http://www.iana.org/assignments/character-sets Valid Encoding Specifiers
    #
    # @return [NSString] encoding of document
    def encoding
      # Documentation placeholder: the including class gets the real
      # implementation as an alias to characterEncoding
    end

    # Set encoding of document
    #
    # @see file:///Developer/Documentation/DocSets/com.apple.adc.documentation.AppleSnowLeopard.CoreReference.docset/Contents/Resources/Documents/documentation/Cocoa/Reference/Foundation/Classes/NSXMLDocument_Class/Reference/Reference.html#//apple_ref/occ/instm/NSXMLDocument/setCharacterEncoding: Developer Documentation
    # @see http://developer.apple.com/mac/library/documentation/Cocoa/Reference/Foundation/Classes/NSXMLDocument_Class/NSXMLDocument_Class.pdf Apple NSXMLDocument PDF
    #
    # @param [NSString] enc valid character encoding
    # @return [void]
    def encoding=(enc)
      # Documentation placeholder: the including class gets the real
      # implementation as an alias to setCharacterEncoding
    end

    # Determine if output of document is treated as HTML, e.g. <br> style tags
    #
    # @return [Boolean] result of comparing the content kind with NSXMLDocumentHTMLKind
    def html?
      kind = documentContentKind
      kind == NSXMLDocumentHTMLKind
    end

    # Determine if output of document is treated as XHTML, e.g. <br/> style tags
    #
    # @return [Boolean] result of comparing the content kind with NSXMLDocumentXHTMLKind
    def xhtml?
      kind = documentContentKind
      kind == NSXMLDocumentXHTMLKind
    end

    # Determine if output of document is treated as XML
    #
    # @return [Boolean] result of comparing the content kind with NSXMLDocumentXMLKind
    def xml?
      kind = documentContentKind
      kind == NSXMLDocumentXMLKind
    end
  end
end
|
47
|
+
|
48
|
+
::NSXMLDocument.send :include, CocoaXML::NSXMLDocumentExtras
|
@@ -0,0 +1,106 @@
|
|
1
|
+
module CocoaXML
  # These are a set of methods added onto the Cocoa NSXMLNode class and children.
  module NSXMLNodeExtras
    # Search from this node down using a css selector
    #
    # The selector is translated to XPath with a ".//" prefix. When the
    # translation yields several expressions (presumably one per
    # comma-separated selector) they are combined with the XPath union
    # operator instead of being concatenated into one invalid path.
    #
    # @param [String] selector selector used to select nodes from document
    # @return [NodeSet<NSXMLNode>] array of nodes matched by selector
    def css(selector)
      xpath ::Nokogiri::CSS::xpath_for(selector, :prefix => ".//").join(' | ')
    end

    # Search document using provided path
    #
    # @param [String] path path used to select nodes from document
    # @return [NodeSet<NSXMLNode>, nil] array of nodes matched by selector,
    #   or nil when the error pointer was set by the lookup
    def xpath(path)
      error = Pointer.new(:object)
      results = nodesForXPath path, error: error

      return NodeSet.new(results) if error[0].nil?

      # TODO: Do something with the error
      nil
    end

    # Process document using provided query
    #
    # @param [String] query query used to process information in document
    # @return [NodeSet<NSXMLNode, String>, nil] results depend on
    #   query. Notice that unlike {#xpath} basic types can also be returned.
    #   nil is returned when the error pointer was set by the query.
    def xquery(query)
      error = Pointer.new(:object)
      results = objectsForXQuery query, error: error

      return NodeSet.new(results) if error[0].nil?

      # TODO: Do something with the error
      nil
    end

    # Get the contained text from this node and children nodes
    #
    # @return [String] text contents of node
    def text
      xquery('data(.)').join
    end
    alias :inner_text :text

    # Get the value of an attribute of node
    #
    # @param [String] attr attribute of node to query
    # @return [String, nil] string value of attribute or nil if no attribute
    #   (also nil when the underlying query reports an error)
    def [](attr)
      values = xquery("data(@#{attr})")
      # xquery yields nil on error; treat that like a missing attribute
      # instead of raising NoMethodError on nil.
      values && values.pop
    end

    # Set the value of an attribute of node
    #
    # An existing attribute is updated in place; otherwise a new attribute
    # node is created and added. The decision no longer depends on the
    # return value of setStringValue, so addAttribute is only called when
    # the attribute is really missing.
    #
    # @param [String] attr attribute of node to set
    # @param [#to_s] value value to set attribute to
    # @return [Object] the value that was passed in
    def []=(attr, value)
      name = attr.to_s
      string_value = value.to_s
      existing = attributeForName(name)

      if existing
        existing.setStringValue(string_value)
      else
        addAttribute(::NSXMLNode.attributeWithName(name, stringValue: string_value))
      end

      value
    end

    # @private
    def attribute(attr)
      attributeForName(attr.to_s)
    end

    # @return [String] the xml of this node including children nodes with proper indentation
    def to_s
      XMLStringWithOptions(NSXMLNodePrettyPrint)
    end

    # @private
    #
    # Hook run when the module is included: adds Ruby-style aliases to the
    # including class and wraps its +children+ so a NodeSet is returned.
    def self.included(klass)
      klass.class_eval do
        alias :remove :detach
        alias :unlink :detach
        alias :path :XPath

        # TODO: Why is this not working
        # Wrap only once: if the hook ran again, old_children would be
        # re-aliased to the wrapper itself and children would recurse forever.
        unless method_defined?(:old_children)
          alias :old_children :children
          def children
            NodeSet.new(old_children)
          end
        end
      end
    end

    # Remove this node from its parent
    #
    # @return [self] this node
    def remove
      # Documentation placeholder: the including class gets the real
      # implementation as an alias to detach
    end
    alias :unlink :remove

    # An XPath formula to reach this node
    #
    # @return [String] XPath to this node
    def path
      # Documentation placeholder: the including class gets the real
      # implementation as an alias to :XPath
    end
  end
end
|
105
|
+
|
106
|
+
::NSXMLNode.send :include, CocoaXML::NSXMLNodeExtras
|
data/lib/nokogiri/css.rb
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
require 'nokogiri/css/node'
|
2
|
+
require 'nokogiri/css/xpath_visitor'
|
3
|
+
require 'nokogiri/css/generated_parser'
|
4
|
+
require 'nokogiri/css/generated_tokenizer'
|
5
|
+
require 'nokogiri/css/tokenizer'
|
6
|
+
require 'nokogiri/css/parser'
|
7
|
+
require 'nokogiri/css/syntax_error'
|
8
|
+
|
9
|
+
module Nokogiri
  # Modules to convert CSS selectors to valid XPath
  # @see http://nokogiri.org/Nokogiri/CSS.html Nokogiri Documentation
  module CSS
    ###
    # Parse the CSS selector given in +selector+ with a fresh Parser and
    # return what the parser produces (an AST).
    def self.parse(selector)
      parser = Parser.new
      parser.parse(selector)
    end

    ###
    # Get the XPath for +selector+. The parser is built from options[:ns]
    # (an empty hash when that entry is missing or nil) and receives the
    # complete +options+ hash together with the selector.
    def self.xpath_for(selector, options = {})
      namespaces = options[:ns] || {}
      Parser.new(namespaces).xpath_for(selector, options)
    end
  end
end
|