nekohtml 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
data/.gitignore ADDED
@@ -0,0 +1,21 @@
1
+ ## MAC OS
2
+ .DS_Store
3
+
4
+ ## TEXTMATE
5
+ *.tmproj
6
+ tmtags
7
+
8
+ ## EMACS
9
+ *~
10
+ \#*
11
+ .\#*
12
+
13
+ ## VIM
14
+ *.swp
15
+
16
+ ## PROJECT::GENERAL
17
+ coverage
18
+ rdoc
19
+ pkg
20
+
21
+ ## PROJECT::SPECIFIC
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Alex Young
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,39 @@
1
+ = nekohtml
2
+
3
+ A thin wrapper around NekoHTML as provided by Celerity.
4
+
5
+ At the moment this gem depends on Celerity to provide the nekohtml jar.
6
+ Once I can figure out how to make this optional, I'll provide it here if
7
+ the celerity gem isn't here at install time.
8
+
9
+ == Usage
10
+
11
+ jruby-1.4.0 > require 'nekohtml'
12
+ => true
13
+ jruby-1.4.0 > html= "<html><head><title>Title of Majesty</title></head></html>"
14
+ => "<html><head><title>Title of Majesty</title></head></html>"
15
+ jruby-1.4.0 > doc= Nekohtml.parse(html)
16
+ => #<Nekohtml::HtmlDocument:0x3f70119f ... >
17
+ jruby-1.4.0 > doc.search("//title")
18
+ => #<Nekohtml::HtmlNodeList:0x1a7b5617 ... >
19
+ jruby-1.4.0 > _.first.text
20
+ => "Title of Majesty"
21
+
22
+ Note that the parser is configured to report element names in lower case,
23
+ so xpath expressions must use lower-case tag names (//title, not //TITLE),
24
+ whatever case the tags have in the source HTML.
25
+
26
+ == Note on Patches/Pull Requests
27
+
28
+ * Fork the project.
29
+ * Make your feature addition or bug fix.
30
+ * Add tests for it. This is important so I don't break it in a
31
+ future version unintentionally.
32
+ * Commit, do not mess with rakefile, version, or history.
33
+ (if you want to have your own version, that is fine but bump version in a
34
+ commit by itself I can ignore when I pull)
35
+ * Send me a pull request. Bonus points for topic branches.
36
+
37
+ == Copyright
38
+
39
+ Copyright (c) 2010 Alex Young. See LICENSE for details.
data/Rakefile ADDED
@@ -0,0 +1,54 @@
require 'rubygems'
require 'rake'

# Gem packaging tasks. Jeweler is treated as optional: if it (or one of its
# dependencies) cannot be loaded, a hint is printed and the remaining tasks
# in this file are still defined.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "nekohtml"
    gem.summary = %Q{Thin wrapper around the NekoHTML parser}
    gem.description = %Q{Almost the briefest possible wrapper around the NekoHTML parser to provide xpath functionality.}
    gem.email = "alex@blackkettle.org"
    gem.homepage = "http://github.com/regularfry/nekohtml"
    gem.authors = ["Alex Young"]
    gem.add_dependency "celerity", ">=0"
    gem.add_development_dependency "thoughtbot-shoulda", ">= 0"
    gem.add_development_dependency "yard", ">= 0"
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
end

# Unit tests: every test/**/test_*.rb file, with lib/ and test/ on the load path.
require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

# Coverage run over the same test files. When rcov is not installed the
# :rcov task still exists, but aborts with installation instructions.
begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |test|
    test.libs << 'test'
    test.pattern = 'test/**/test_*.rb'
    test.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end

# :check_dependencies is not defined anywhere in this Rakefile; it is expected
# to come from the Jeweler tasks above. Only hook it in when it actually
# exists, so that `rake test` keeps working when Jeweler could not be loaded
# instead of failing on an unknown prerequisite.
task :test => :check_dependencies if Rake::Task.task_defined?(:check_dependencies)

task :default => :test

# API documentation. When YARD is not installed the :yardoc task still
# exists, but aborts with installation instructions.
begin
  require 'yard'
  YARD::Rake::YardocTask.new
rescue LoadError
  task :yardoc do
    abort "YARD is not available. In order to run yardoc, you must: sudo gem install yard"
  end
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.0.3
data/lib/nekohtml.rb ADDED
@@ -0,0 +1,35 @@
1
+ require 'celerity'
2
+ require 'nekohtml/html_document'
3
+
# Entry points for turning an HTML string into a searchable document.
module Nekohtml
  class << self

    # Builds a new Xerces DOM parser driven by the NekoHTML configuration.
    #
    # The parser is set up with the NekoHTML "names/elems" property set to
    # "lower" and with the SAX namespaces feature switched off.
    #
    # @return [org.apache.xerces.parsers.DOMParser] a freshly configured parser
    def parser
      configuration = org.cyberneko.html.HTMLConfiguration.new
      jparser = org.apache.xerces.parsers.DOMParser.new(configuration)
      jparser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower")
      jparser.setFeature("http://xml.org/sax/features/namespaces", false)
      jparser
    end

    # Parses an HTML string and wraps the resulting DOM document.
    #
    # A new parser (see #parser) is created for every call. Errors raised by
    # the underlying Java parser are not rescued here.
    #
    # @param string [String] the HTML markup to parse
    # @return [HtmlDocument] wrapper around the parsed DOM document
    # @raise [ArgumentError] if +string+ is nil (or false)
    def parse(string)
      raise ArgumentError, "an HTML string is required, got #{string.inspect}" unless string

      jparser = parser
      jinput_reader = java.io.StringReader.new(string.to_java_string)
      jinput_source = org.xml.sax.InputSource.new(jinput_reader)
      jparser.parse(jinput_source)

      # Reaching this point means the parser did not raise.
      HtmlDocument.new(jparser.get_document)
    end
  end
end
@@ -0,0 +1,78 @@
module Nekohtml
  # Base class for the wrappers in this file. Holds a Java object and offers
  # XPath lookups evaluated against it.
  class HtmlThing

    attr_accessor :java_object

    # @param java_object the Java object to wrap (a document, node or node
    #   list, depending on the subclass)
    def initialize(java_object)
      @java_object = java_object
    end

    # Evaluates +xpath+ against the wrapped Java object.
    #
    # @param xpath [String] the XPath expression
    # @param settings the javax.xml.xpath.XPathConstants value selecting the
    #   kind of result (e.g. NODESET or NODE)
    # @return the raw Java result, or nil when evaluation raised an error
    #   caught by the bare rescue below
    def do_search(xpath, settings)
      jxpath_object = jxpath_factory.newXPath
      begin
        jxpath_object.evaluate(xpath, @java_object, settings)
      rescue
        # Deliberate best effort: a failed evaluation is reported as "no result".
        nil
      end
    end

    # Finds every node matching +xpath+.
    #
    # @param xpath [String] the XPath expression
    # @return [HtmlNodeList, nil] the matches, or nil when the lookup
    #   produced no result object
    def search(xpath)
      jnode_list = do_search(xpath, javax.xml.xpath.XPathConstants::NODESET)
      jnode_list ? HtmlNodeList.new(jnode_list) : nil
    end

    # Finds a single node matching +xpath+.
    #
    # @param xpath [String] the XPath expression
    # @return [HtmlNode, nil] the match, or nil when nothing was found
    def at(xpath)
      jnode = do_search(xpath, javax.xml.xpath.XPathConstants::NODE)
      jnode ? HtmlNode.new(jnode) : nil
    end

    private

    # The XPath factory is created on first use rather than in the
    # constructor, so wrappers that are never searched (for example the nodes
    # yielded while iterating a node list) do not each trigger a factory
    # lookup.
    def jxpath_factory
      @jxpath_factory ||= javax.xml.xpath.XPathFactory.newInstance
    end
  end

  # Wrapper for a parsed document; adds nothing to HtmlThing.
  class HtmlDocument < HtmlThing; end

  # Enumerable wrapper around a Java node list. The wrapped object must
  # respond to getLength and item(index).
  class HtmlNodeList < HtmlThing
    include Enumerable

    # @return [Integer] the number of nodes in the list
    def length
      @java_object.getLength
    end

    # Yields each node in order, wrapped in an HtmlNode. Without a block an
    # enumerator is returned instead.
    def each
      return to_enum(:each) unless block_given?

      @java_object.getLength.times do |i|
        yield HtmlNode.new(@java_object.item(i))
      end
    end
  end

  # Wrapper around a single Java node.
  class HtmlNode < HtmlThing
    # @return the text content reported by the wrapped node
    def text
      @java_object.text_content
    end

    # Same as #text.
    def value
      text
    end
  end

end
data/test/helper.rb ADDED
@@ -0,0 +1,5 @@
1
+ require 'rubygems'
2
+ require 'test/unit'
3
+ require 'mocha'
4
+ gem 'celerity'
5
+ $:.push 'lib'
@@ -0,0 +1,31 @@
1
+ require File.expand_path(File.join(File.dirname(__FILE__), 'helper'))
2
+ require 'nekohtml/html_document'
3
+
# Unit tests for Nekohtml::HtmlNode, run against a stubbed Java node.
class TestHtmlNode < Test::Unit::TestCase
  include Nekohtml

  def setup
    @jnode_stub = stub("(java_node)")
    @subject = HtmlNode.new(@jnode_stub)
  end

  # search must delegate to do_search and ask for a node set.
  def test_search
    expression = "//example/xpath"
    nodeset = javax.xml.xpath.XPathConstants::NODESET
    @subject.expects(:do_search).with(expression, nodeset)

    @subject.search(expression)
  end

  # text reads text_content from the wrapped node.
  def test_text
    @jnode_stub.expects(:text_content).returns("")
    @subject.text
  end

  # value goes through the same text_content call as text.
  def test_value
    @jnode_stub.expects(:text_content).returns(nil)
    @subject.value
  end
end
@@ -0,0 +1,71 @@
1
+ require 'test/helper'
2
+ require 'nekohtml'
3
+
# Tests for Nekohtml.parse and xpath lookups on the parsed document.
class TestNekohtml < Test::Unit::TestCase
  # Parsing nil is rejected with an ArgumentError.
  def test_nil_content_raises
    assert_raises(ArgumentError) { Nekohtml.parse(nil) }
  end

  # A minimal page parses to a non-nil document.
  def test_parse_simple_content
    markup = "<html><head></head><body></body></html>"
    assert_not_nil Nekohtml.parse(markup)
  end

  # End-to-end check: parse a page, then search it with xpath.
  def test_parse_to_xpath
    markup = <<-HTML
<html>
<head>
<title>HTML Page Title</title>
</head>
<body>
<ul>
<li>Foo</li>
<li>Bar</li>
</ul>
</body>
</html>
HTML

    document = Nekohtml.parse(markup)
    flunk "Bad root node" if document.nil?

    items = document.search("//li")

    assert_equal 2, items.length
    assert_equal ["Foo", "Bar"], items.map { |item| item.text }
  end

  # An XHTML page declaring a default namespace is still searchable by
  # plain element name.
  def test_html_with_namespaces
    markup = <<-HTML
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head></head>
</html>
HTML

    document = Nekohtml.parse(markup)
    assert_not_nil document

    heads = document.search("//head")

    assert_equal 1, heads.length
  end

  # Lower-case tag names in the xpath find the element.
  def test_not_case_sensitive
    markup = "<html> <head><title>Foo</title></head> <body><h1>Heading</h1></body> </html>"
    xpath = "//h1"
    heading = Nekohtml.parse(markup).at(xpath)
    assert_not_nil heading, "Nothing found with the xpath #{xpath}."
    assert_equal "Heading", heading.text
  end

end
metadata ADDED
@@ -0,0 +1,97 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: nekohtml
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.3
5
+ platform: ruby
6
+ authors:
7
+ - Alex Young
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2010-02-15 00:00:00 +00:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: celerity
17
+ type: :runtime
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: "0"
24
+ version:
25
+ - !ruby/object:Gem::Dependency
26
+ name: thoughtbot-shoulda
27
+ type: :development
28
+ version_requirement:
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: "0"
34
+ version:
35
+ - !ruby/object:Gem::Dependency
36
+ name: yard
37
+ type: :development
38
+ version_requirement:
39
+ version_requirements: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ version: "0"
44
+ version:
45
+ description: Almost the briefest possible wrapper around the NekoHTML parser to provide xpath functionality.
46
+ email: alex@blackkettle.org
47
+ executables: []
48
+
49
+ extensions: []
50
+
51
+ extra_rdoc_files:
52
+ - LICENSE
53
+ - README.rdoc
54
+ files:
55
+ - .document
56
+ - .gitignore
57
+ - LICENSE
58
+ - README.rdoc
59
+ - Rakefile
60
+ - VERSION
61
+ - lib/nekohtml.rb
62
+ - lib/nekohtml/html_document.rb
63
+ - test/helper.rb
64
+ - test/test_html_node.rb
65
+ - test/test_html_parser.rb
66
+ has_rdoc: true
67
+ homepage: http://github.com/regularfry/nekohtml
68
+ licenses: []
69
+
70
+ post_install_message:
71
+ rdoc_options:
72
+ - --charset=UTF-8
73
+ require_paths:
74
+ - lib
75
+ required_ruby_version: !ruby/object:Gem::Requirement
76
+ requirements:
77
+ - - ">="
78
+ - !ruby/object:Gem::Version
79
+ version: "0"
80
+ version:
81
+ required_rubygems_version: !ruby/object:Gem::Requirement
82
+ requirements:
83
+ - - ">="
84
+ - !ruby/object:Gem::Version
85
+ version: "0"
86
+ version:
87
+ requirements: []
88
+
89
+ rubyforge_project:
90
+ rubygems_version: 1.3.5
91
+ signing_key:
92
+ specification_version: 3
93
+ summary: Thin wrapper around the NekoHTML parser
94
+ test_files:
95
+ - test/test_html_parser.rb
96
+ - test/test_html_node.rb
97
+ - test/helper.rb