regularfry-celerity_parser 0.1.1-java

Sign up to get free protection for your applications and to get access to all the features.
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
data/.gitignore ADDED
@@ -0,0 +1,5 @@
1
+ *.sw?
2
+ .DS_Store
3
+ coverage
4
+ rdoc
5
+ pkg
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Alex Young
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,57 @@
1
+ celerity_parser
2
+ by Alex Young
3
+ http://github.com/regularfry/celerity_parser
4
+
5
+ == DESCRIPTION:
6
+
7
+ A very thin wrapper around HtmlUnit's HTML parser to allow xpath searches of
8
+ HTML documents for JRuby. At the moment there's not much here. I don't know yet
9
+ whether it's worth expanding the API to a full nokogiri/hpricot-style
10
+ implementation. What's here works for what I need for now.
11
+
12
+ == FEATURES/PROBLEMS:
13
+
14
+ * Reuses HtmlUnit's parser as wrapped by celerity to provide a stable HTML
15
+   parsing capability.
16
+
17
+ == SYNOPSIS:
18
+
19
+ Basic use:
20
+
21
+ root_node = CelerityParser.parse(html_content)
22
+ found_elements = root_node.search("//html/head/title")
23
+ found_elements.first.text # => "Html page title"
24
+
25
+
26
+ == REQUIREMENTS:
27
+
28
+ $ jruby -S gem install jarib-celerity --source=http://gems.github.com
29
+
30
+ == INSTALL:
31
+
32
+ $ jruby -S gem install regularfry-celerity_parser --source=http://gems.github.com
33
+
34
+ == LICENSE:
35
+
36
+ (The MIT License)
37
+
38
+ Copyright (c) 2009 Alex Young
39
+
40
+ Permission is hereby granted, free of charge, to any person obtaining
41
+ a copy of this software and associated documentation files (the
42
+ 'Software'), to deal in the Software without restriction, including
43
+ without limitation the rights to use, copy, modify, merge, publish,
44
+ distribute, sublicense, and/or sell copies of the Software, and to
45
+ permit persons to whom the Software is furnished to do so, subject to
46
+ the following conditions:
47
+
48
+ The above copyright notice and this permission notice shall be
49
+ included in all copies or substantial portions of the Software.
50
+
51
+ THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
52
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
53
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
54
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
55
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
56
+ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
57
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/Rakefile ADDED
@@ -0,0 +1,60 @@
1
require 'rubygems'
require 'rake'

# Gem-related tasks come from Jeweler when it is installed. If it cannot be
# loaded we only print a hint, so the test/rdoc tasks below remain usable.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "celerity_parser"
    gem.summary = %Q{Thin wrapper around a native Java HTML parser}
    gem.email = "alex@blackkettle.org"
    gem.homepage = "http://github.com/regularfry/celerity_parser"
    gem.authors = ["Alex Young"]
    gem.rubyforge_project = "celerity_parser"
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
    gem.add_dependency 'jarib-celerity', '>= 0.0.6.10'
    gem.platform = "java"
  end

  Jeweler::RubyforgeTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
end

# `rake test` runs every test/**/test_*.rb file with lib/ and test/ on the
# load path.
require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

# `rake rcov` measures coverage over the same files as `rake test`. When rcov
# is not installed the task still exists but aborts with install instructions.
begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |test|
    test.libs << 'test'
    # Must use the same naming scheme as the :test task above; the test files
    # in this project are named test_*.rb, so '*_test.rb' would match nothing.
    test.pattern = 'test/**/test_*.rb'
    test.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end


task :default => :test

# `rake rdoc` builds the API documentation into rdoc/, titled with the
# current project version when one can be determined.
require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
  # The project keeps its version in a plain-text VERSION file; a
  # VERSION.yml with :major/:minor/:patch keys is still honoured if present.
  version =
    if File.exist?('VERSION')
      File.read('VERSION').strip
    elsif File.exist?('VERSION.yml')
      require 'yaml' # only needed for the YAML fallback
      config = YAML.load(File.read('VERSION.yml'))
      "#{config[:major]}.#{config[:minor]}.#{config[:patch]}"
    else
      ""
    end

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "celerity_parser #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
60
+
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.1
@@ -0,0 +1,57 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
# Gem specification for celerity_parser: package metadata, the list of
# shipped files, and the single runtime dependency on jarib-celerity.
Gem::Specification.new do |s|
  s.name = %q{celerity_parser}
  s.version = "0.1.1"
  s.platform = %q{java}

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["Alex Young"]
  s.date = %q{2009-06-22}
  s.email = %q{alex@blackkettle.org}
  s.extra_rdoc_files = [
    "LICENSE",
    "README.rdoc"
  ]
  s.files = [
    ".document",
    ".gitignore",
    "LICENSE",
    "README.rdoc",
    "Rakefile",
    "VERSION",
    "celerity_parser.gemspec",
    "lib/celerity_parser.rb",
    "lib/celerity_parser/html_node.rb",
    "lib/celerity_parser/html_parser.rb",
    "test/helper.rb",
    "test/test_celerity_parser.rb",
    "test/test_html_node.rb",
    "test/test_html_parser.rb"
  ]
  s.homepage = %q{http://github.com/regularfry/celerity_parser}
  s.rdoc_options = ["--charset=UTF-8"]
  s.require_paths = ["lib"]
  s.rubyforge_project = %q{celerity_parser}
  s.rubygems_version = %q{1.3.3}
  s.summary = %q{Thin wrapper around a native Java HTML parser}
  s.test_files = [
    "test/helper.rb",
    "test/test_html_node.rb",
    "test/test_html_parser.rb",
    "test/test_celerity_parser.rb"
  ]

  s.specification_version = 3 if s.respond_to? :specification_version

  # Declare the dependency as a runtime dependency when this RubyGems offers
  # that API, and fall back to the generic call otherwise. Checking for the
  # method itself avoids comparing against a RubyGems version constant.
  if s.respond_to? :add_runtime_dependency
    s.add_runtime_dependency(%q<jarib-celerity>, [">= 0.0.6.10"])
  else
    s.add_dependency(%q<jarib-celerity>, [">= 0.0.6.10"])
  end
end
@@ -0,0 +1,47 @@
1
module CelerityParser

  # Thin Ruby wrapper around a single node object handed back by the
  # underlying Java HTML parser. Every public method simply delegates to the
  # wrapped object, so the wrapped object only needs to respond to
  # +get_by_xpath+, +as_text+, +value+ and +as_xml+.
  class HtmlNode# {{{

    # Builds an HtmlNode around +java_node+.
    #
    # This extra entry point will allow me to insert different Node
    # classes in the future if necessary.
    def self.wrap(java_node)
      new(java_node)
    end

    # Not called by user code; use HtmlNode.wrap instead.
    def initialize(java_node)
      @java_node = java_node
    end

    # Perform an xpath search from this node and return an array
    # of HtmlNodes representing the results.
    def search(xpath)
      wrap_java_nodes(@java_node.get_by_xpath(xpath))
    end

    # The innerText of the node (whatever the wrapped node's +as_text+
    # returns).
    def text
      @java_node.as_text
    end

    # HtmlNodes represent both tags and attributes, but attributes
    # have value rather than text.
    def value
      @java_node.value
    end

    # The XHTML representation of this node (whatever the wrapped node's
    # +as_xml+ returns).
    def html
      @java_node.as_xml
    end

    private

    # Wraps each raw search result in an HtmlNode, preserving order.
    def wrap_java_nodes(java_results)
      java_results.map { |java_node| HtmlNode.wrap(java_node) }
    end

  end # }}}

end
@@ -0,0 +1,24 @@
1
+ require 'celerity_parser/html_node'
2
+
3
+ require 'celerity'
4
+
5
module CelerityParser
  # Entry points for turning an HTML string into an HtmlNode by driving
  # HtmlUnit's parser through JRuby's Java integration.
  class HtmlParser# {{{
    # Base URL used when the caller does not supply an href.
    DEFAULT_HREF = "http://example.com"

    # Creates a fresh HtmlUnit WebClient and returns a new TopLevelWindow
    # named +title+ that belongs to it.
    #
    # NOTE(review): the WebClient created here is never explicitly closed;
    # confirm whether that matters for long-running callers.
    def self.get_window(title)
      client = Java.com.gargoylesoftware.htmlunit.WebClient.new
      Java.com.gargoylesoftware.htmlunit.TopLevelWindow.new(title, client)
    end

    # Wraps the HTML string +content+ in an HtmlUnit StringWebResponse.
    # +href+ becomes the response URL; DEFAULT_HREF is used when it is nil.
    def self.get_wrapped_content(content, href=nil)
      url = java.net.URL.new(href || DEFAULT_HREF)
      Java.com.gargoylesoftware.htmlunit.StringWebResponse.new(content, url)
    end

    # Parses +content+ and returns the parse result wrapped in an HtmlNode.
    #
    # content:: the HTML source as a string
    # href::    optional URL to associate with the content
    #
    # Raises ArgumentError when +content+ is nil (or false).
    def self.parse(content, href=nil)
      raise ArgumentError, "Nil content!" unless content

      window = get_window("window")
      wrapped_content = get_wrapped_content(content, href)
      HtmlNode.wrap(Java.com.gargoylesoftware.htmlunit.html.HTMLParser.parse(wrapped_content, window))
    end
  end# }}}
end
@@ -0,0 +1,56 @@
1
+ require 'celerity'
2
+
3
# Top-level namespace of the library: version and path helpers plus the
# public CelerityParser.parse entry point.
module CelerityParser

  # :stopdoc:
  # Kept in step with the packaged gem version (VERSION file / gemspec).
  VERSION = '0.1.1'.freeze
  # Absolute directory containing this file, with a trailing separator.
  LIBPATH = (::File.expand_path(::File.dirname(__FILE__)) + ::File::SEPARATOR).freeze
  # Parent directory of LIBPATH, with a trailing separator.
  PATH = (::File.dirname(LIBPATH) + ::File::SEPARATOR).freeze
  # :startdoc:

  # Returns the version string for the library.
  #
  def self.version
    VERSION
  end

  # Returns the library path for the module. If any arguments are given,
  # they will be joined to the end of the library path using
  # <tt>File.join</tt>.
  #
  def self.libpath( *args )
    args.empty? ? LIBPATH : ::File.join(LIBPATH, args.flatten)
  end

  # Returns the path for the module. If any arguments are given,
  # they will be joined to the end of the path using
  # <tt>File.join</tt>.
  #
  def self.path( *args )
    args.empty? ? PATH : ::File.join(PATH, args.flatten)
  end

  # Utility method used to require all files ending in .rb that lie in the
  # directory below this file that has the same name as the filename passed
  # in. Optionally, a specific _directory_ name can be passed in such that
  # the _filename_ does not have to be equivalent to the directory.
  #
  # Files are required in sorted path order.
  #
  def self.require_all_libs_relative_to( fname, dir = nil )
    dir ||= ::File.basename(fname, '.*')
    search_me = ::File.expand_path(
        ::File.join(::File.dirname(fname), dir, '**', '*.rb'))

    Dir.glob(search_me).sort.each { |rb| require rb }
  end

  # The entry point. Pass the html string as content, with an optional
  # href to use for resolving relative URLs. Delegates to
  # CelerityParser::HtmlParser.parse and returns its result.
  def self.parse(content, href=nil)
    CelerityParser::HtmlParser.parse(content, href)
  end

end # module CelerityParser
53
+
54
+ CelerityParser.require_all_libs_relative_to(__FILE__)
55
+
56
+ # EOF
data/test/helper.rb ADDED
@@ -0,0 +1,5 @@
1
+ $LOAD_PATH.unshift(File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib')))
2
+ require 'rubygems'
3
+ require 'test/unit'
4
+ require 'mocha'
5
+ gem 'jarib-celerity'
File without changes
@@ -0,0 +1,39 @@
1
+ require File.expand_path(File.join(File.dirname(__FILE__), 'helper'))
2
+ require 'celerity_parser/html_node'
3
+
4
# Unit tests for CelerityParser::HtmlNode. The wrapped Java node is replaced
# by a Mocha stub, so these tests need no Java parser; each one checks both
# that the call is delegated and that the delegate's result is passed back.
class TestHtmlNode < Test::Unit::TestCase
  include CelerityParser

  def setup
    @java_node = stub("(java_node)")
    @node = HtmlNode.new(@java_node)
  end

  def test_search
    xpath = "//example/xpath"
    @java_node.expects(:get_by_xpath).with(xpath).returns([])

    assert_equal [], @node.search(xpath)
  end

  # Search results must come back wrapped, not as raw Java nodes.
  def test_search_wraps_results
    xpath = "//example/xpath"
    child = stub("(child_java_node)")
    child.expects(:as_text).returns("child text")
    @java_node.expects(:get_by_xpath).with(xpath).returns([child])

    results = @node.search(xpath)

    assert_equal 1, results.length
    assert_instance_of HtmlNode, results.first
    assert_equal "child text", results.first.text
  end

  # Verify wrap through its observable result (a working HtmlNode around the
  # given node) rather than by mocking the constructor.
  def test_wrap
    wrapped = HtmlNode.wrap(@java_node)
    assert_instance_of HtmlNode, wrapped

    @java_node.expects(:as_text).returns("wrapped text")
    assert_equal "wrapped text", wrapped.text
  end

  def test_text
    @java_node.expects(:as_text).returns("")
    assert_equal "", @node.text
  end

  def test_value
    @java_node.expects(:value).returns(nil)
    assert_nil @node.value
  end

  def test_html
    @java_node.expects(:as_xml).returns("")
    assert_equal "", @node.html
  end

end
@@ -0,0 +1,55 @@
1
+ require File.expand_path(File.join(File.dirname(__FILE__), 'helper'))
2
+ require 'celerity_parser/html_parser'
3
+
4
# Tests for CelerityParser::HtmlParser. Apart from the nil-content check,
# these exercise the real Java parser and therefore need JRuby with celerity
# available.
class TestHtmlParser < Test::Unit::TestCase
  include CelerityParser

  def test_nil_content_raises
    assert_raises(ArgumentError) do
      HtmlParser.parse(nil)
    end
  end

  def test_get_window
    assert_not_nil HtmlParser.get_window("foo title")
  end

  def test_get_wrapped_content
    content = "<html></html>"
    assert_not_nil HtmlParser.get_wrapped_content(content)
  end

  def test_parse_simple_content
    content = "<html><head></head><body></body></html>"
    assert_not_nil HtmlParser.parse(content)
  end

  # The optional href argument must be accepted as well.
  def test_parse_simple_content_with_href
    content = "<html><head></head><body></body></html>"
    assert_not_nil HtmlParser.parse(content, "http://example.org/page.html")
  end

  ###
  # This is something of an integration test
  def test_parse_to_xpath
    content = <<-HTML
      <html>
        <head>
          <title>HTML Page Title</title>
        </head>
        <body>
          <ul>
            <li>Foo</li>
            <li>Bar</li>
          </ul>
        </body>
      </html>
    HTML

    root_node = HtmlParser.parse(content)
    assert_not_nil root_node, "Bad root node"

    xpath = "//li"

    search_results = root_node.search(xpath)

    assert_equal 2, search_results.length
    assert_equal ["Foo", "Bar"], search_results.map { |r| r.text }
  end

end
metadata ADDED
@@ -0,0 +1,79 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: regularfry-celerity_parser
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.1
5
+ platform: java
6
+ authors:
7
+ - Alex Young
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-06-22 00:00:00 -07:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: jarib-celerity
17
+ type: :runtime
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: 0.0.6.10
24
+ version:
25
+ description:
26
+ email: alex@blackkettle.org
27
+ executables: []
28
+
29
+ extensions: []
30
+
31
+ extra_rdoc_files:
32
+ - LICENSE
33
+ - README.rdoc
34
+ files:
35
+ - .document
36
+ - .gitignore
37
+ - LICENSE
38
+ - README.rdoc
39
+ - Rakefile
40
+ - VERSION
41
+ - celerity_parser.gemspec
42
+ - lib/celerity_parser.rb
43
+ - lib/celerity_parser/html_node.rb
44
+ - lib/celerity_parser/html_parser.rb
45
+ - test/helper.rb
46
+ - test/test_celerity_parser.rb
47
+ - test/test_html_node.rb
48
+ - test/test_html_parser.rb
49
+ has_rdoc: false
50
+ homepage: http://github.com/regularfry/celerity_parser
51
+ post_install_message:
52
+ rdoc_options:
53
+ - --charset=UTF-8
54
+ require_paths:
55
+ - lib
56
+ required_ruby_version: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - ">="
59
+ - !ruby/object:Gem::Version
60
+ version: "0"
61
+ version:
62
+ required_rubygems_version: !ruby/object:Gem::Requirement
63
+ requirements:
64
+ - - ">="
65
+ - !ruby/object:Gem::Version
66
+ version: "0"
67
+ version:
68
+ requirements: []
69
+
70
+ rubyforge_project: celerity_parser
71
+ rubygems_version: 1.2.0
72
+ signing_key:
73
+ specification_version: 3
74
+ summary: Thin wrapper around a native Java HTML parser
75
+ test_files:
76
+ - test/helper.rb
77
+ - test/test_html_node.rb
78
+ - test/test_html_parser.rb
79
+ - test/test_celerity_parser.rb