regularfry-celerity_parser 0.1.1-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
data/.gitignore ADDED
@@ -0,0 +1,5 @@
1
+ *.sw?
2
+ .DS_Store
3
+ coverage
4
+ rdoc
5
+ pkg
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Alex Young
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,57 @@
1
+ celerity_parser
2
+ by Alex Young
3
+ http://github.com/regularfry/celerity_parser
4
+
5
+ == DESCRIPTION:
6
+
7
+ A very thin wrapper around HtmlUnit's HTML parser to allow xpath searches of
8
+ HTML documents for JRuby. At the moment there's not much here. I don't know yet
9
+ whether it's worth expanding the API to a full nokogiri/hpricot-style
10
+ implementation. What's here works for what I need for now.
11
+
12
+ == FEATURES/PROBLEMS:
13
+
14
+ * Reuses HtmlUnit's parser as wrapped by celerity to provide a stable HTML
15
+   parsing capability.
16
+
17
+ == SYNOPSIS:
18
+
19
+ Basic use:
20
+
21
+ root_node = CelerityParser.parse(html_content)
22
+ found_elements = root_node.search("//html/head/title")
23
+ found_elements.first.text # => "Html page title"
24
+
25
+
26
+ == REQUIREMENTS:
27
+
28
+ $ jruby -S gem install jarib-celerity --source=http://gems.github.com
29
+
30
+ == INSTALL:
31
+
32
+ $ jruby -S gem install regularfry-celerity_parser --source=http://gems.github.com
33
+
34
+ == LICENSE:
35
+
36
+ (The MIT License)
37
+
38
+ Copyright (c) 2009 Alex Young
39
+
40
+ Permission is hereby granted, free of charge, to any person obtaining
41
+ a copy of this software and associated documentation files (the
42
+ 'Software'), to deal in the Software without restriction, including
43
+ without limitation the rights to use, copy, modify, merge, publish,
44
+ distribute, sublicense, and/or sell copies of the Software, and to
45
+ permit persons to whom the Software is furnished to do so, subject to
46
+ the following conditions:
47
+
48
+ The above copyright notice and this permission notice shall be
49
+ included in all copies or substantial portions of the Software.
50
+
51
+ THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
52
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
53
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
54
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
55
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
56
+ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
57
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/Rakefile ADDED
@@ -0,0 +1,60 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+
4
+ begin # packaging tasks; Jeweler is optional, see the rescue branch below
5
+ require 'jeweler'
6
+ Jeweler::Tasks.new do |gem|
7
+ gem.name = "celerity_parser"
8
+ gem.summary = %Q{Thin wrapper around a native Java HTML parser}
9
+ gem.email = "alex@blackkettle.org"
10
+ gem.homepage = "http://github.com/regularfry/celerity_parser"
11
+ gem.authors = ["Alex Young"]
12
+ gem.rubyforge_project = "celerity_parser"
13
+ # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
14
+ gem.add_dependency 'jarib-celerity', '>= 0.0.6.10'
15
+ gem.platform = "java" # the library calls Java classes directly, so it targets JRuby
16
+ end
17
+
18
+ Jeweler::RubyforgeTasks.new
19
+ rescue LoadError # Jeweler missing: print a hint and carry on loading the other tasks
20
+ puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
21
+ end
22
+
23
+ require 'rake/testtask'
24
+ Rake::TestTask.new(:test) do |test|
25
+ test.libs << 'lib' << 'test'
26
+ test.pattern = 'test/**/test_*.rb'
27
+ test.verbose = true
28
+ end
29
+
30
+ begin
31
+ require 'rcov/rcovtask'
32
+ Rcov::RcovTask.new do |test|
33
+ test.libs << 'test'
34
+ test.pattern = 'test/**/*_test.rb'
35
+ test.verbose = true
36
+ end
37
+ rescue LoadError
38
+ task :rcov do
39
+ abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
40
+ end
41
+ end
42
+
43
+
44
+ task :default => :test
45
+
46
+ require 'rake/rdoctask'
47
+ Rake::RDocTask.new do |rdoc|
48
+ if File.exist?('VERSION.yml')
49
+ config = YAML.load(File.read('VERSION.yml'))
50
+ version = "#{config[:major]}.#{config[:minor]}.#{config[:patch]}"
51
+ else
52
+ version = ""
53
+ end
54
+
55
+ rdoc.rdoc_dir = 'rdoc'
56
+ rdoc.title = "celerity_parser #{version}"
57
+ rdoc.rdoc_files.include('README*')
58
+ rdoc.rdoc_files.include('lib/**/*.rb')
59
+ end
60
+
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.1
@@ -0,0 +1,57 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ Gem::Specification.new do |s|
4
+ s.name = %q{celerity_parser}
5
+ s.version = "0.1.1"
6
+ s.platform = %q{java}
7
+
8
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
9
+ s.authors = ["Alex Young"]
10
+ s.date = %q{2009-06-22}
11
+ s.email = %q{alex@blackkettle.org}
12
+ s.extra_rdoc_files = [
13
+ "LICENSE",
14
+ "README.rdoc"
15
+ ]
16
+ s.files = [
17
+ ".document",
18
+ ".gitignore",
19
+ "LICENSE",
20
+ "README.rdoc",
21
+ "Rakefile",
22
+ "VERSION",
23
+ "celerity_parser.gemspec",
24
+ "lib/celerity_parser.rb",
25
+ "lib/celerity_parser/html_node.rb",
26
+ "lib/celerity_parser/html_parser.rb",
27
+ "test/helper.rb",
28
+ "test/test_celerity_parser.rb",
29
+ "test/test_html_node.rb",
30
+ "test/test_html_parser.rb"
31
+ ]
32
+ s.homepage = %q{http://github.com/regularfry/celerity_parser}
33
+ s.rdoc_options = ["--charset=UTF-8"]
34
+ s.require_paths = ["lib"]
35
+ s.rubyforge_project = %q{celerity_parser}
36
+ s.rubygems_version = %q{1.3.3}
37
+ s.summary = %q{Thin wrapper around a native Java HTML parser}
38
+ s.test_files = [
39
+ "test/helper.rb",
40
+ "test/test_html_node.rb",
41
+ "test/test_html_parser.rb",
42
+ "test/test_celerity_parser.rb"
43
+ ]
44
+
45
+ if s.respond_to? :specification_version then
46
+ current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
47
+ s.specification_version = 3
48
+
49
+ if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
50
+ s.add_runtime_dependency(%q<jarib-celerity>, [">= 0.0.6.10"])
51
+ else
52
+ s.add_dependency(%q<jarib-celerity>, [">= 0.0.6.10"])
53
+ end
54
+ else
55
+ s.add_dependency(%q<jarib-celerity>, [">= 0.0.6.10"])
56
+ end
57
+ end
@@ -0,0 +1,47 @@
1
+ module CelerityParser
2
+
3
+ class HtmlNode# {{{
4
+
5
+ # This extra entry point will allow me to insert different Node
6
+ # classes in the future if necessary.
7
+ def self.wrap(java_node)
8
+ new(java_node)
9
+ end
10
+
11
+ # Not called by user code.
12
+ def initialize(java_node)
13
+ @java_node = java_node
14
+ end
15
+
16
+ # Perform an xpath search from this node and return an array
17
+ # of HtmlNodes representing the results.
18
+ def search(xpath)
19
+ wrap_java_nodes(@java_node.get_by_xpath(xpath))
20
+ end
21
+
22
+ # The innerText of the node.
23
+ def text
24
+ @java_node.as_text
25
+ end
26
+
27
+ # HtmlNodes represent both tags and attributes, but attributes
28
+ # have value rather than text.
29
+ def value
30
+ @java_node.value
31
+ end
32
+
33
+ # The XHTML representation of this node.
34
+ def html
35
+ @java_node.as_xml
36
+ end
37
+
38
+ def wrap_java_nodes(java_results)
39
+ java_results.map do |java_node|
40
+ HtmlNode.wrap(java_node)
41
+ end
42
+ end
43
+ private :wrap_java_nodes
44
+
45
+ end # }}}
46
+
47
+ end
@@ -0,0 +1,24 @@
1
+ require 'celerity_parser/html_node'
2
+
3
+ require 'celerity'
4
+
5
+ module CelerityParser
6
+ class HtmlParser# {{{
7
+ def self.get_window(title)
8
+ client = Java.com.gargoylesoftware.htmlunit.WebClient.new()
9
+ return Java.com.gargoylesoftware.htmlunit.TopLevelWindow.new(title, client)
10
+ end
11
+
12
+ def self.get_wrapped_content(content, href=nil)
13
+ url = java.net.URL.new(href || "http://example.com")
14
+ return Java.com.gargoylesoftware.htmlunit.StringWebResponse.new(content, url)
15
+ end
16
+
17
+ def self.parse(content, href=nil)
18
+ raise ArgumentError.new("Nil content!") if !content
19
+ window = self.get_window("window")
20
+ wrapped_content = self.get_wrapped_content(content, href)
21
+ return HtmlNode.wrap(Java.com.gargoylesoftware.htmlunit.html.HTMLParser.parse(wrapped_content, window))
22
+ end
23
+ end# }}}
24
+ end
@@ -0,0 +1,56 @@
1
+ require 'celerity'
2
+
3
+ module CelerityParser
4
+
5
+ # :stopdoc:
6
+ VERSION = '1.0.0'
7
+ LIBPATH = ::File.expand_path(::File.dirname(__FILE__)) + ::File::SEPARATOR
8
+ PATH = ::File.dirname(LIBPATH) + ::File::SEPARATOR
9
+ # :startdoc:
10
+
11
+ # Returns the version string for the library.
12
+ #
13
+ def self.version
14
+ VERSION
15
+ end
16
+
17
+ # Returns the library path for the module. If any arguments are given,
18
+ # they will be joined to the end of the libray path using
19
+ # <tt>File.join</tt>.
20
+ #
21
+ def self.libpath( *args )
22
+ args.empty? ? LIBPATH : ::File.join(LIBPATH, args.flatten)
23
+ end
24
+
25
+ # Returns the lpath for the module. If any arguments are given,
26
+ # they will be joined to the end of the path using
27
+ # <tt>File.join</tt>.
28
+ #
29
+ def self.path( *args )
30
+ args.empty? ? PATH : ::File.join(PATH, args.flatten)
31
+ end
32
+
33
+ # Utility method used to require all files ending in .rb that lie in the
34
+ # directory below this file that has the same name as the filename passed
35
+ # in. Optionally, a specific _directory_ name can be passed in such that
36
+ # the _filename_ does not have to be equivalent to the directory.
37
+ #
38
+ def self.require_all_libs_relative_to( fname, dir = nil )
39
+ dir ||= ::File.basename(fname, '.*')
40
+ search_me = ::File.expand_path(
41
+ ::File.join(::File.dirname(fname), dir, '**', '*.rb'))
42
+
43
+ Dir.glob(search_me).sort.each {|rb| require rb}
44
+ end
45
+
46
+ # The entry point. Pass the html string as content, with an optional
47
+ # href to use for resolving relative URLs.
48
+ def self.parse(content, href=nil)
49
+ return CelerityParser::HtmlParser.parse(content, href)
50
+ end
51
+
52
+ end # module CelerityParser
53
+
54
+ CelerityParser.require_all_libs_relative_to(__FILE__)
55
+
56
+ # EOF
data/test/helper.rb ADDED
@@ -0,0 +1,5 @@
1
+ $LOAD_PATH.unshift(File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib')))
2
+ require 'rubygems'
3
+ require 'test/unit'
4
+ require 'mocha'
5
+ gem 'jarib-celerity'
File without changes
@@ -0,0 +1,39 @@
1
+ require File.expand_path(File.join(File.dirname(__FILE__), 'helper'))
2
+ require 'celerity_parser/html_node'
3
+
4
+ class TestHtmlNode < Test::Unit::TestCase
5
+ include CelerityParser
6
+
7
+ def setup
8
+ @java_node = stub("(java_node)")
9
+ @node = HtmlNode.new(@java_node)
10
+ end
11
+
12
+ def test_search
13
+ xpath = "//example/xpath"
14
+ @java_node.expects(:get_by_xpath).with(xpath).returns([])
15
+
16
+ @node.search(xpath)
17
+ end
18
+
19
+ def test_wrap
20
+ HtmlNode.any_instance.expects(:initialize).with(@java_node)
21
+ HtmlNode.wrap(@java_node)
22
+ end
23
+
24
+ def test_text
25
+ @java_node.expects(:as_text).returns("")
26
+ @node.text
27
+ end
28
+
29
+ def test_value
30
+ @java_node.expects(:value).returns(nil)
31
+ @node.value
32
+ end
33
+
34
+ def test_html
35
+ @java_node.expects(:as_xml).returns("")
36
+ @node.html
37
+ end
38
+
39
+ end
@@ -0,0 +1,55 @@
1
+ require File.expand_path(File.join(File.dirname(__FILE__), 'helper'))
2
+ require 'celerity_parser/html_parser'
3
+
4
+ class TestHtmlParser < Test::Unit::TestCase
5
+ include CelerityParser
6
+
7
+ def test_nil_content_raises
8
+ assert_raises(ArgumentError) do
9
+ HtmlParser.parse(nil)
10
+ end
11
+ end
12
+
13
+ def test_get_window
14
+ assert_not_nil HtmlParser.get_window("foo title")
15
+ end
16
+
17
+ def test_get_wrapped_content
18
+ content = "<html></html>"
19
+ assert_not_nil HtmlParser.get_wrapped_content(content)
20
+ end
21
+
22
+ def test_parse_simple_content
23
+ content = "<html><head></head><body></body></html>"
24
+ assert_not_nil HtmlParser.parse(content)
25
+ end
26
+
27
+ ###
28
+ # This is something of an integration test
29
+ def test_parse_to_xpath
30
+ content = <<-HTML
31
+ <html>
32
+ <head>
33
+ <title>HTML Page Title</title>
34
+ </head>
35
+ <body>
36
+ <ul>
37
+ <li>Foo</li>
38
+ <li>Bar</li>
39
+ </ul>
40
+ </body>
41
+ </html>
42
+ HTML
43
+
44
+ root_node = HtmlParser.parse(content)
45
+ flunk "Bad root node" if root_node.nil?
46
+
47
+ xpath = "//li"
48
+
49
+ search_results = root_node.search(xpath)
50
+
51
+ assert_equal 2, search_results.length
52
+ assert_equal ["Foo", "Bar"], search_results.map{|r| r.text}
53
+ end
54
+
55
+ end
metadata ADDED
@@ -0,0 +1,79 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: regularfry-celerity_parser
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.1
5
+ platform: java
6
+ authors:
7
+ - Alex Young
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-06-22 00:00:00 -07:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: jarib-celerity
17
+ type: :runtime
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: 0.0.6.10
24
+ version:
25
+ description:
26
+ email: alex@blackkettle.org
27
+ executables: []
28
+
29
+ extensions: []
30
+
31
+ extra_rdoc_files:
32
+ - LICENSE
33
+ - README.rdoc
34
+ files:
35
+ - .document
36
+ - .gitignore
37
+ - LICENSE
38
+ - README.rdoc
39
+ - Rakefile
40
+ - VERSION
41
+ - celerity_parser.gemspec
42
+ - lib/celerity_parser.rb
43
+ - lib/celerity_parser/html_node.rb
44
+ - lib/celerity_parser/html_parser.rb
45
+ - test/helper.rb
46
+ - test/test_celerity_parser.rb
47
+ - test/test_html_node.rb
48
+ - test/test_html_parser.rb
49
+ has_rdoc: false
50
+ homepage: http://github.com/regularfry/celerity_parser
51
+ post_install_message:
52
+ rdoc_options:
53
+ - --charset=UTF-8
54
+ require_paths:
55
+ - lib
56
+ required_ruby_version: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - ">="
59
+ - !ruby/object:Gem::Version
60
+ version: "0"
61
+ version:
62
+ required_rubygems_version: !ruby/object:Gem::Requirement
63
+ requirements:
64
+ - - ">="
65
+ - !ruby/object:Gem::Version
66
+ version: "0"
67
+ version:
68
+ requirements: []
69
+
70
+ rubyforge_project: celerity_parser
71
+ rubygems_version: 1.2.0
72
+ signing_key:
73
+ specification_version: 3
74
+ summary: Thin wrapper around a native Java HTML parser
75
+ test_files:
76
+ - test/helper.rb
77
+ - test/test_html_node.rb
78
+ - test/test_html_parser.rb
79
+ - test/test_celerity_parser.rb