nekohtml 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
data/.gitignore ADDED
@@ -0,0 +1,21 @@
1
+ ## MAC OS
2
+ .DS_Store
3
+
4
+ ## TEXTMATE
5
+ *.tmproj
6
+ tmtags
7
+
8
+ ## EMACS
9
+ *~
10
+ \#*
11
+ .\#*
12
+
13
+ ## VIM
14
+ *.swp
15
+
16
+ ## PROJECT::GENERAL
17
+ coverage
18
+ rdoc
19
+ pkg
20
+
21
+ ## PROJECT::SPECIFIC
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Alex Young
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,39 @@
1
+ = nekohtml
2
+
3
+ A thin wrapper around NekoHTML as provided by Celerity.
4
+
5
+ At the moment this gem depends on Celerity to provide the nekohtml jar.
6
+ Once I can figure out how to make this optional, I'll provide it here if
7
+ the celerity gem isn't here at install time.
8
+
9
+ == Usage
10
+
11
+ jruby-1.4.0 > require 'nekohtml'
12
+ => true
13
+ jruby-1.4.0 > html= "<html><head><title>Title of Majesty</title></head></html>"
14
+ => "<html><head><title>Title of Majesty</title></head></html>"
15
+ jruby-1.4.0 > doc= Nekohtml.parse(html)
16
+ => #<Nekohtml::HtmlDocument:0x3f70119f ... >
17
+ jruby-1.4.0 > doc.search("//title")
18
+ => #<Nekohtml::HtmlNodeList:0x1a7b5617 ... >
19
+ jruby-1.4.0 > _.first.text
20
+ => "Title of Majesty"
21
+
22
+ Note that the xpath should use lower-case tag names: the parser is
23
+ configured to report element names in lower case, whatever case the
24
+ source HTML uses.
25
+
26
+ == Note on Patches/Pull Requests
27
+
28
+ * Fork the project.
29
+ * Make your feature addition or bug fix.
30
+ * Add tests for it. This is important so I don't break it in a
31
+ future version unintentionally.
32
+ * Commit, do not mess with rakefile, version, or history.
33
+ (if you want to have your own version, that is fine but bump version in a
34
+ commit by itself I can ignore when I pull)
35
+ * Send me a pull request. Bonus points for topic branches.
36
+
37
+ == Copyright
38
+
39
+ Copyright (c) 2010 Alex Young. See LICENSE for details.
data/Rakefile ADDED
@@ -0,0 +1,54 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+
4
+ begin
5
+ require 'jeweler'
6
+ Jeweler::Tasks.new do |gem|
7
+ gem.name = "nekohtml"
8
+ gem.summary = %Q{Thin wrapper around the NekoHTML parser}
9
+ gem.description = %Q{Almost the briefest possible wrapper around the NekoHTML parser to provide xpath functionality.}
10
+ gem.email = "alex@blackkettle.org"
11
+ gem.homepage = "http://github.com/regularfry/nekohtml"
12
+ gem.authors = ["Alex Young"]
13
+ gem.add_dependency "celerity", ">=0"
14
+ gem.add_development_dependency "thoughtbot-shoulda", ">= 0"
15
+ gem.add_development_dependency "yard", ">= 0"
16
+ # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
17
+ end
18
+ Jeweler::GemcutterTasks.new
19
+ rescue LoadError
20
+ puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
21
+ end
22
+
23
+ require 'rake/testtask'
24
+ Rake::TestTask.new(:test) do |test|
25
+ test.libs << 'lib' << 'test'
26
+ test.pattern = 'test/**/test_*.rb'
27
+ test.verbose = true
28
+ end
29
+
30
+ begin
31
+ require 'rcov/rcovtask'
32
+ Rcov::RcovTask.new do |test|
33
+ test.libs << 'test'
34
+ test.pattern = 'test/**/test_*.rb'
35
+ test.verbose = true
36
+ end
37
+ rescue LoadError
38
+ task :rcov do
39
+ abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
40
+ end
41
+ end
42
+
43
+ task :test => :check_dependencies
44
+
45
+ task :default => :test
46
+
47
+ begin
48
+ require 'yard'
49
+ YARD::Rake::YardocTask.new
50
+ rescue LoadError
51
+ task :yardoc do
52
+ abort "YARD is not available. In order to run yardoc, you must: sudo gem install yard"
53
+ end
54
+ end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.0.3
data/lib/nekohtml.rb ADDED
@@ -0,0 +1,35 @@
1
+ require 'celerity'
2
+ require 'nekohtml/html_document'
3
+
4
+ module Nekohtml
5
+ class << self
6
+
7
+ def parser()
8
+ configuration = org.cyberneko.html.HTMLConfiguration.new
9
+ jparser = org.apache.xerces.parsers.DOMParser.new(configuration)
10
+ jparser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
11
+ jparser.setFeature("http://xml.org/sax/features/namespaces", false)
12
+ return jparser
13
+ end
14
+
15
+ # Parse the string. case_sensitive controls whether you can use lower-case xpath
16
+ # elements for tag names or not. case_sensitive=true uses the default NekoHTML
17
+ # parser, which forces everything to be upper case per HTML 4.01. This is a pain.
18
+ def parse(string)
19
+ if string
20
+ jparser = parser()
21
+
22
+ jinput_reader = java.io.StringReader.new(string.to_java_string)
23
+ jinput_source = org.xml.sax.InputSource.new(jinput_reader)
24
+ jparser.parse(jinput_source)
25
+ jdocument = jparser.get_document()
26
+ # We know that the document has successfully been parsed
27
+ # at this point.
28
+
29
+ return HtmlDocument.new(jdocument)
30
+ else
31
+ raise ArgumentError.new
32
+ end
33
+ end
34
+ end
35
+ end
@@ -0,0 +1,78 @@
1
+ module Nekohtml
2
+ class HtmlThing
3
+
4
+ attr_accessor :java_object
5
+ def initialize(java_object)
6
+ @java_object = java_object
7
+
8
+ @jxpath_factory =
9
+ javax.xml.xpath.XPathFactory.newInstance()
10
+ end
11
+
12
+ def do_search(xpath, settings)
13
+ jxpath_object = @jxpath_factory.newXPath()
14
+ jmaybe_node_list = begin
15
+ jxpath_object.evaluate(
16
+ xpath,
17
+ @java_object,
18
+ settings
19
+ )
20
+ rescue
21
+ nil
22
+ end
23
+ return jmaybe_node_list
24
+ end
25
+
26
+ def search(xpath)
27
+ @jxpath_settings = javax.xml.xpath.XPathConstants::NODESET
28
+ jnode_list = self.do_search(xpath, @jxpath_settings)
29
+
30
+ result = jnode_list ? HtmlNodeList.new(jnode_list) : nil
31
+ end
32
+
33
+ def at(xpath)
34
+ @jxpath_settings = javax.xml.xpath.XPathConstants::NODE
35
+ jnode = self.do_search(xpath, @jxpath_settings)
36
+
37
+ result = jnode ? HtmlNode.new(jnode) : nil
38
+ end
39
+ end
40
+
41
+ class HtmlDocument < HtmlThing; end # Adds nothing to HtmlThing; Nekohtml.parse returns one of these.
42
+
43
+ class HtmlNodeList < HtmlThing
44
+ # @java_object is a NodeList in this case
45
+ include Enumerable
46
+
47
+ def initialize(*args)
48
+ super
49
+ # Just an alias
50
+ @jnode_list = @java_object
51
+ end
52
+
53
+ def length
54
+ @jnode_list.getLength()
55
+ end
56
+
57
+ def each
58
+ @jnode_list.getLength().times do |i|
59
+ yield HtmlNode.new(@jnode_list.item(i))
60
+ end
61
+ end
62
+ end
63
+
64
+ class HtmlNode < HtmlThing
65
+ def initialize(java_object)
66
+ super
67
+ @jelement = @java_object
68
+ end
69
+
70
+ def text
71
+ @jelement.text_content
72
+ end
73
+ def value
74
+ return self.text
75
+ end
76
+ end
77
+
78
+ end
data/test/helper.rb ADDED
@@ -0,0 +1,5 @@
1
+ require 'rubygems'
2
+ require 'test/unit'
3
+ require 'mocha'
4
+ gem 'celerity'
5
+ $:.push 'lib'
@@ -0,0 +1,31 @@
1
+ require File.expand_path(File.join(File.dirname(__FILE__), 'helper'))
2
+ require 'nekohtml/html_document'
3
+
4
+ class TestHtmlNode < Test::Unit::TestCase
5
+ include Nekohtml
6
+
7
+ def setup
8
+ @java_node = stub("(java_node)")
9
+ @node = HtmlNode.new(@java_node)
10
+ end
11
+
12
+ def test_search
13
+ xpath = "//example/xpath"
14
+ @node.expects(:do_search).with(xpath,
15
+ javax.xml.xpath.XPathConstants::NODESET)
16
+
17
+ @node.search(xpath)
18
+ end
19
+
20
+ def test_text
21
+ @java_node.expects(:text_content).returns("")
22
+ @node.text
23
+ end
24
+
25
+ def test_value
26
+ @java_node.expects(:text_content).returns(nil)
27
+ @node.value
28
+ end
29
+
30
+
31
+ end
@@ -0,0 +1,71 @@
1
+ require 'test/helper'
2
+ require 'nekohtml'
3
+
4
+ class TestNekohtml < Test::Unit::TestCase
5
+ def test_nil_content_raises
6
+ assert_raises(ArgumentError) do
7
+ Nekohtml.parse(nil)
8
+ end
9
+ end
10
+
11
+ def test_parse_simple_content
12
+ content = "<html><head></head><body></body></html>"
13
+ assert_not_nil Nekohtml.parse(content)
14
+ end
15
+
16
+ ###
17
+ # This is something of an integration test
18
+ def test_parse_to_xpath
19
+ content = <<-HTML
20
+ <html>
21
+ <head>
22
+ <title>HTML Page Title</title>
23
+ </head>
24
+ <body>
25
+ <ul>
26
+ <li>Foo</li>
27
+ <li>Bar</li>
28
+ </ul>
29
+ </body>
30
+ </html>
31
+ HTML
32
+
33
+ root_node = Nekohtml.parse(content)
34
+ flunk "Bad root node" if root_node.nil?
35
+
36
+ xpath = "//li"
37
+
38
+ search_results = root_node.search(xpath)
39
+
40
+ assert_equal 2, search_results.length
41
+ assert_equal ["Foo", "Bar"], search_results.map{|r| r.text}
42
+ end
43
+
44
+ def test_html_with_namespaces
45
+ content = <<-HTML
46
+ <?xml version="1.0" encoding="UTF-8"?>
47
+ <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
48
+ <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
49
+ <head></head>
50
+ </html>
51
+ HTML
52
+
53
+ root_node = Nekohtml.parse(content)
54
+ assert_not_nil root_node
55
+
56
+ xpath = "//head"
57
+ search_results = root_node.search(xpath)
58
+
59
+ assert_equal 1, search_results.length
60
+ end
61
+
62
+ def test_not_case_sensitive
63
+ content = "<html> <head><title>Foo</title></head> <body><h1>Heading</h1></body> </html>"
64
+ root_node = Nekohtml.parse(content)
65
+ xpath = "//h1"
66
+ search_result = root_node.at(xpath)
67
+ assert_not_nil search_result, "Nothing found with the xpath #{xpath}."
68
+ assert_equal "Heading", search_result.text
69
+ end
70
+
71
+ end
metadata ADDED
@@ -0,0 +1,97 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: nekohtml
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.3
5
+ platform: ruby
6
+ authors:
7
+ - Alex Young
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2010-02-15 00:00:00 +00:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: celerity
17
+ type: :runtime
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: "0"
24
+ version:
25
+ - !ruby/object:Gem::Dependency
26
+ name: thoughtbot-shoulda
27
+ type: :development
28
+ version_requirement:
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: "0"
34
+ version:
35
+ - !ruby/object:Gem::Dependency
36
+ name: yard
37
+ type: :development
38
+ version_requirement:
39
+ version_requirements: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ version: "0"
44
+ version:
45
+ description: Almost the briefest possible wrapper around the NekoHTML parser to provide xpath functionality.
46
+ email: alex@blackkettle.org
47
+ executables: []
48
+
49
+ extensions: []
50
+
51
+ extra_rdoc_files:
52
+ - LICENSE
53
+ - README.rdoc
54
+ files:
55
+ - .document
56
+ - .gitignore
57
+ - LICENSE
58
+ - README.rdoc
59
+ - Rakefile
60
+ - VERSION
61
+ - lib/nekohtml.rb
62
+ - lib/nekohtml/html_document.rb
63
+ - test/helper.rb
64
+ - test/test_html_node.rb
65
+ - test/test_html_parser.rb
66
+ has_rdoc: true
67
+ homepage: http://github.com/regularfry/nekohtml
68
+ licenses: []
69
+
70
+ post_install_message:
71
+ rdoc_options:
72
+ - --charset=UTF-8
73
+ require_paths:
74
+ - lib
75
+ required_ruby_version: !ruby/object:Gem::Requirement
76
+ requirements:
77
+ - - ">="
78
+ - !ruby/object:Gem::Version
79
+ version: "0"
80
+ version:
81
+ required_rubygems_version: !ruby/object:Gem::Requirement
82
+ requirements:
83
+ - - ">="
84
+ - !ruby/object:Gem::Version
85
+ version: "0"
86
+ version:
87
+ requirements: []
88
+
89
+ rubyforge_project:
90
+ rubygems_version: 1.3.5
91
+ signing_key:
92
+ specification_version: 3
93
+ summary: Thin wrapper around the NekoHTML parser
94
+ test_files:
95
+ - test/test_html_parser.rb
96
+ - test/test_html_node.rb
97
+ - test/helper.rb