nekohtml 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.gitignore +21 -0
- data/LICENSE +20 -0
- data/README.rdoc +39 -0
- data/Rakefile +54 -0
- data/VERSION +1 -0
- data/lib/nekohtml.rb +35 -0
- data/lib/nekohtml/html_document.rb +78 -0
- data/test/helper.rb +5 -0
- data/test/test_html_node.rb +31 -0
- data/test/test_html_parser.rb +71 -0
- metadata +97 -0
data/.document
ADDED
data/.gitignore
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2009 Alex Young
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.rdoc
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
= nekohtml
|
2
|
+
|
3
|
+
A thin wrapper around NekoHTML as provided by Celerity.
|
4
|
+
|
5
|
+
At the moment this gem depends on Celerity to provide the nekohtml jar.
|
6
|
+
Once I can figure out how to make this optional, I'll provide it here if
|
7
|
+
the celerity gem isn't here at install time.
|
8
|
+
|
9
|
+
== Usage
|
10
|
+
|
11
|
+
jruby-1.4.0 > require 'nekohtml'
|
12
|
+
=> true
|
13
|
+
jruby-1.4.0 > html= "<html><head><title>Title of Majesty</title></head></html>"
|
14
|
+
=> "<html><head><title>Title of Majesty</title></head></html>"
|
15
|
+
jruby-1.4.0 > doc= Nekohtml.parse(html)
|
16
|
+
=> #<Nekohtml::HtmlDocument:0x3f70119f ... >
|
17
|
+
jruby-1.4.0 > doc.search("//title")
|
18
|
+
=> #<Nekohtml::HtmlNodeList:0x1a7b5617 ... >
|
19
|
+
jruby-1.4.0 > _.first.text
|
20
|
+
=> "Title of Majesty"
|
21
|
+
|
22
|
+
Note that the xpath must use lower-case tag names. NekoHTML upper-cases
|
23
|
+
element names by default, but this wrapper configures the parser to lower-case
|
24
|
+
them, so an all-caps expression such as //TITLE will not match.
|
25
|
+
|
26
|
+
== Note on Patches/Pull Requests
|
27
|
+
|
28
|
+
* Fork the project.
|
29
|
+
* Make your feature addition or bug fix.
|
30
|
+
* Add tests for it. This is important so I don't break it in a
|
31
|
+
future version unintentionally.
|
32
|
+
* Commit, do not mess with rakefile, version, or history.
|
33
|
+
(if you want to have your own version, that is fine but bump version in a
|
34
|
+
commit by itself I can ignore when I pull)
|
35
|
+
* Send me a pull request. Bonus points for topic branches.
|
36
|
+
|
37
|
+
== Copyright
|
38
|
+
|
39
|
+
Copyright (c) 2010 Alex Young. See LICENSE for details.
|
data/Rakefile
ADDED
@@ -0,0 +1,54 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'rake'
|
3
|
+
|
4
|
+
begin
|
5
|
+
require 'jeweler'
|
6
|
+
Jeweler::Tasks.new do |gem|
|
7
|
+
gem.name = "nekohtml"
|
8
|
+
gem.summary = %Q{Thin wrapper around the NekoHTML parser}
|
9
|
+
gem.description = %Q{Almost the briefest possible wrapper around the NekoHTML parser to provide xpath functionality.}
|
10
|
+
gem.email = "alex@blackkettle.org"
|
11
|
+
gem.homepage = "http://github.com/regularfry/nekohtml"
|
12
|
+
gem.authors = ["Alex Young"]
|
13
|
+
gem.add_dependency "celerity", ">=0"
|
14
|
+
gem.add_development_dependency "thoughtbot-shoulda", ">= 0"
|
15
|
+
gem.add_development_dependency "yard", ">= 0"
|
16
|
+
# gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
|
17
|
+
end
|
18
|
+
Jeweler::GemcutterTasks.new
|
19
|
+
rescue LoadError
|
20
|
+
puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
|
21
|
+
end
|
22
|
+
|
23
|
+
require 'rake/testtask'
|
24
|
+
Rake::TestTask.new(:test) do |test|
|
25
|
+
test.libs << 'lib' << 'test'
|
26
|
+
test.pattern = 'test/**/test_*.rb'
|
27
|
+
test.verbose = true
|
28
|
+
end
|
29
|
+
|
30
|
+
begin
|
31
|
+
require 'rcov/rcovtask'
|
32
|
+
Rcov::RcovTask.new do |test|
|
33
|
+
test.libs << 'test'
|
34
|
+
test.pattern = 'test/**/test_*.rb'
|
35
|
+
test.verbose = true
|
36
|
+
end
|
37
|
+
rescue LoadError
|
38
|
+
task :rcov do
|
39
|
+
abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
task :test => :check_dependencies
|
44
|
+
|
45
|
+
task :default => :test
|
46
|
+
|
47
|
+
begin
|
48
|
+
require 'yard'
|
49
|
+
YARD::Rake::YardocTask.new
|
50
|
+
rescue LoadError
|
51
|
+
task :yardoc do
|
52
|
+
abort "YARD is not available. In order to run yardoc, you must: sudo gem install yard"
|
53
|
+
end
|
54
|
+
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.0.3
|
data/lib/nekohtml.rb
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
require 'celerity'
|
2
|
+
require 'nekohtml/html_document'
|
3
|
+
|
4
|
+
module Nekohtml
|
5
|
+
class << self
|
6
|
+
|
7
|
+
def parser()
|
8
|
+
configuration = org.cyberneko.html.HTMLConfiguration.new
|
9
|
+
jparser = org.apache.xerces.parsers.DOMParser.new(configuration)
|
10
|
+
jparser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
|
11
|
+
jparser.setFeature("http://xml.org/sax/features/namespaces", false)
|
12
|
+
return jparser
|
13
|
+
end
|
14
|
+
|
15
|
+
# Parse the string into an HtmlDocument. The parser is configured to lower-case
|
16
|
+
# element names, so xpath expressions use lower-case tag names; the default NekoHTML
|
17
|
+
# behaviour is upper case per HTML 4.01, which is a pain. Raises ArgumentError when string is nil.
|
18
|
+
def parse(string)
|
19
|
+
if string
|
20
|
+
jparser = parser()
|
21
|
+
|
22
|
+
jinput_reader = java.io.StringReader.new(string.to_java_string)
|
23
|
+
jinput_source = org.xml.sax.InputSource.new(jinput_reader)
|
24
|
+
jparser.parse(jinput_source)
|
25
|
+
jdocument = jparser.get_document()
|
26
|
+
# We know that the document has successfully been parsed
|
27
|
+
# at this point.
|
28
|
+
|
29
|
+
return HtmlDocument.new(jdocument)
|
30
|
+
else
|
31
|
+
raise ArgumentError.new
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
data/lib/nekohtml/html_document.rb
ADDED
@@ -0,0 +1,78 @@
|
|
1
|
+
module Nekohtml
|
2
|
+
class HtmlThing
|
3
|
+
|
4
|
+
attr_accessor :java_object
|
5
|
+
def initialize(java_object)
|
6
|
+
@java_object = java_object
|
7
|
+
|
8
|
+
@jxpath_factory =
|
9
|
+
javax.xml.xpath.XPathFactory.newInstance()
|
10
|
+
end
|
11
|
+
|
12
|
+
def do_search(xpath, settings)
|
13
|
+
jxpath_object = @jxpath_factory.newXPath()
|
14
|
+
jmaybe_node_list = begin
|
15
|
+
jxpath_object.evaluate(
|
16
|
+
xpath,
|
17
|
+
@java_object,
|
18
|
+
settings
|
19
|
+
)
|
20
|
+
rescue
|
21
|
+
nil
|
22
|
+
end
|
23
|
+
return jmaybe_node_list
|
24
|
+
end
|
25
|
+
|
26
|
+
def search(xpath)
|
27
|
+
@jxpath_settings = javax.xml.xpath.XPathConstants::NODESET
|
28
|
+
jnode_list = self.do_search(xpath, @jxpath_settings)
|
29
|
+
|
30
|
+
result = jnode_list ? HtmlNodeList.new(jnode_list) : nil
|
31
|
+
end
|
32
|
+
|
33
|
+
def at(xpath)
|
34
|
+
@jxpath_settings = javax.xml.xpath.XPathConstants::NODE
|
35
|
+
jnode = self.do_search(xpath, @jxpath_settings)
|
36
|
+
|
37
|
+
result = jnode ? HtmlNode.new(jnode) : nil
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
41
|
+
class HtmlDocument < HtmlThing; end
|
42
|
+
|
43
|
+
class HtmlNodeList < HtmlThing
|
44
|
+
# @java_object is a NodeList in this case
|
45
|
+
include Enumerable
|
46
|
+
|
47
|
+
def initialize(*args)
|
48
|
+
super
|
49
|
+
# Just an alias
|
50
|
+
@jnode_list = @java_object
|
51
|
+
end
|
52
|
+
|
53
|
+
def length
|
54
|
+
@jnode_list.getLength()
|
55
|
+
end
|
56
|
+
|
57
|
+
def each
|
58
|
+
@jnode_list.getLength().times do |i|
|
59
|
+
yield HtmlNode.new(@jnode_list.item(i))
|
60
|
+
end
|
61
|
+
end
|
62
|
+
end
|
63
|
+
|
64
|
+
class HtmlNode < HtmlThing
|
65
|
+
def initialize(java_object)
|
66
|
+
super
|
67
|
+
@jelement = @java_object
|
68
|
+
end
|
69
|
+
|
70
|
+
def text
|
71
|
+
@jelement.text_content
|
72
|
+
end
|
73
|
+
def value
|
74
|
+
return self.text
|
75
|
+
end
|
76
|
+
end
|
77
|
+
|
78
|
+
end
|
data/test/helper.rb
ADDED
data/test/test_html_node.rb
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
require File.expand_path(File.join(File.dirname(__FILE__), 'helper'))
|
2
|
+
require 'nekohtml/html_document'
|
3
|
+
|
4
|
+
class TestHtmlNode < Test::Unit::TestCase
|
5
|
+
include Nekohtml
|
6
|
+
|
7
|
+
def setup
|
8
|
+
@java_node = stub("(java_node)")
|
9
|
+
@node = HtmlNode.new(@java_node)
|
10
|
+
end
|
11
|
+
|
12
|
+
def test_search
|
13
|
+
xpath = "//example/xpath"
|
14
|
+
@node.expects(:do_search).with(xpath,
|
15
|
+
javax.xml.xpath.XPathConstants::NODESET)
|
16
|
+
|
17
|
+
@node.search(xpath)
|
18
|
+
end
|
19
|
+
|
20
|
+
def test_text
|
21
|
+
@java_node.expects(:text_content).returns("")
|
22
|
+
@node.text
|
23
|
+
end
|
24
|
+
|
25
|
+
def test_value
|
26
|
+
@java_node.expects(:text_content).returns(nil)
|
27
|
+
@node.value
|
28
|
+
end
|
29
|
+
|
30
|
+
|
31
|
+
end
|
data/test/test_html_parser.rb
ADDED
@@ -0,0 +1,71 @@
|
|
1
|
+
require 'test/helper'
|
2
|
+
require 'nekohtml'
|
3
|
+
|
4
|
+
class TestNekohtml < Test::Unit::TestCase
|
5
|
+
def test_nil_content_raises
|
6
|
+
assert_raises(ArgumentError) do
|
7
|
+
Nekohtml.parse(nil)
|
8
|
+
end
|
9
|
+
end
|
10
|
+
|
11
|
+
def test_parse_simple_content
|
12
|
+
content = "<html><head></head><body></body></html>"
|
13
|
+
assert_not_nil Nekohtml.parse(content)
|
14
|
+
end
|
15
|
+
|
16
|
+
###
|
17
|
+
# This is something of an integration test
|
18
|
+
def test_parse_to_xpath
|
19
|
+
content = <<-HTML
|
20
|
+
<html>
|
21
|
+
<head>
|
22
|
+
<title>HTML Page Title</title>
|
23
|
+
</head>
|
24
|
+
<body>
|
25
|
+
<ul>
|
26
|
+
<li>Foo</li>
|
27
|
+
<li>Bar</li>
|
28
|
+
</ul>
|
29
|
+
</body>
|
30
|
+
</html>
|
31
|
+
HTML
|
32
|
+
|
33
|
+
root_node = Nekohtml.parse(content)
|
34
|
+
flunk "Bad root node" if root_node.nil?
|
35
|
+
|
36
|
+
xpath = "//li"
|
37
|
+
|
38
|
+
search_results = root_node.search(xpath)
|
39
|
+
|
40
|
+
assert_equal 2, search_results.length
|
41
|
+
assert_equal ["Foo", "Bar"], search_results.map{|r| r.text}
|
42
|
+
end
|
43
|
+
|
44
|
+
def test_html_with_namespaces
|
45
|
+
content = <<-HTML
|
46
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
47
|
+
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
|
48
|
+
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
|
49
|
+
<head></head>
|
50
|
+
</html>
|
51
|
+
HTML
|
52
|
+
|
53
|
+
root_node = Nekohtml.parse(content)
|
54
|
+
assert_not_nil root_node
|
55
|
+
|
56
|
+
xpath = "//head"
|
57
|
+
search_results = root_node.search(xpath)
|
58
|
+
|
59
|
+
assert_equal 1, search_results.length
|
60
|
+
end
|
61
|
+
|
62
|
+
def test_not_case_sensitive
|
63
|
+
content = "<html> <head><title>Foo</title></head> <body><h1>Heading</h1></body> </html>"
|
64
|
+
root_node = Nekohtml.parse(content)
|
65
|
+
xpath = "//h1"
|
66
|
+
search_result = root_node.at(xpath)
|
67
|
+
assert_not_nil search_result, "Nothing found with the xpath #{xpath}."
|
68
|
+
assert_equal "Heading", search_result.text
|
69
|
+
end
|
70
|
+
|
71
|
+
end
|
metadata
ADDED
@@ -0,0 +1,97 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: nekohtml
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.3
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Alex Young
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2010-02-15 00:00:00 +00:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: celerity
|
17
|
+
type: :runtime
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">="
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: "0"
|
24
|
+
version:
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: thoughtbot-shoulda
|
27
|
+
type: :development
|
28
|
+
version_requirement:
|
29
|
+
version_requirements: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: "0"
|
34
|
+
version:
|
35
|
+
- !ruby/object:Gem::Dependency
|
36
|
+
name: yard
|
37
|
+
type: :development
|
38
|
+
version_requirement:
|
39
|
+
version_requirements: !ruby/object:Gem::Requirement
|
40
|
+
requirements:
|
41
|
+
- - ">="
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: "0"
|
44
|
+
version:
|
45
|
+
description: Almost the briefest possible wrapper around the NekoHTML parser to provide xpath functionality.
|
46
|
+
email: alex@blackkettle.org
|
47
|
+
executables: []
|
48
|
+
|
49
|
+
extensions: []
|
50
|
+
|
51
|
+
extra_rdoc_files:
|
52
|
+
- LICENSE
|
53
|
+
- README.rdoc
|
54
|
+
files:
|
55
|
+
- .document
|
56
|
+
- .gitignore
|
57
|
+
- LICENSE
|
58
|
+
- README.rdoc
|
59
|
+
- Rakefile
|
60
|
+
- VERSION
|
61
|
+
- lib/nekohtml.rb
|
62
|
+
- lib/nekohtml/html_document.rb
|
63
|
+
- test/helper.rb
|
64
|
+
- test/test_html_node.rb
|
65
|
+
- test/test_html_parser.rb
|
66
|
+
has_rdoc: true
|
67
|
+
homepage: http://github.com/regularfry/nekohtml
|
68
|
+
licenses: []
|
69
|
+
|
70
|
+
post_install_message:
|
71
|
+
rdoc_options:
|
72
|
+
- --charset=UTF-8
|
73
|
+
require_paths:
|
74
|
+
- lib
|
75
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
76
|
+
requirements:
|
77
|
+
- - ">="
|
78
|
+
- !ruby/object:Gem::Version
|
79
|
+
version: "0"
|
80
|
+
version:
|
81
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
82
|
+
requirements:
|
83
|
+
- - ">="
|
84
|
+
- !ruby/object:Gem::Version
|
85
|
+
version: "0"
|
86
|
+
version:
|
87
|
+
requirements: []
|
88
|
+
|
89
|
+
rubyforge_project:
|
90
|
+
rubygems_version: 1.3.5
|
91
|
+
signing_key:
|
92
|
+
specification_version: 3
|
93
|
+
summary: Thin wrapper around the NekoHTML parser
|
94
|
+
test_files:
|
95
|
+
- test/test_html_parser.rb
|
96
|
+
- test/test_html_node.rb
|
97
|
+
- test/helper.rb
|