rdig 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,25 @@
1
+ <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
2
+ <html>
3
+ <head>
4
+ <meta http-equiv="content-type" content="text/html;charset=iso-8859-1">
5
+ <meta name="DC.title" content="Title from DC meta data" />
6
+ <title>Sample Title</title>
7
+ </head>
8
+ <body>
9
+ <h1 class="title">Sample Title in h1</h1>
10
+ <a href="http://test.host/outside.html">Affe</a>
11
+ <h1>Some sample <span>text</span></h1>
12
+ <div id="content">
13
+ <a href="/inside.html">Affe</a>
14
+ <!-- some comment -->
15
+ <p>
16
+ Real content is here.</p>
17
+ </div>
18
+ <!-- another comment
19
+ here -->
20
+ <p>Some footer</p>
21
+ <a href="#foo">Top</a>
22
+ <a href="/footer.html">Affe</a>
23
+ </body>
24
+ </html>
25
+
@@ -0,0 +1,15 @@
1
+ <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
2
+ <html>
3
+ <head>
4
+ <meta http-equiv="content-type" content="text/html;charset=iso-8859-1">
5
+ <title>Sample &amp; Title</title>
6
+ </head>
7
+ <body>
8
+ <h1>Some &gt; Links</h1>
9
+ <p>don't&nbsp;break me!</p>
10
+ <a href="http://test.host/affe.html?b=a&amp;c=d">Affe</a>
11
+ <a href="http://test.host/affe2.html?b=a&c=d">Affe</a>
12
+ <h1>&Uuml;ml&auml;uts</h1>
13
+ <p>hei&szlig; hier &#223;</p>
14
+ </body>
15
+ </html>
@@ -0,0 +1,17 @@
1
+ <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
2
+ <html>
3
+ <head>
4
+ <meta http-equiv="content-type" content="text/html;charset=iso-8859-1">
5
+ <title>Sample Title</title>
6
+ </head>
7
+ <body>
8
+ <h1>A Link</h1>
9
+ <a href="http://test.host/affe.html">Affe</a>
10
+ <h1>Some sample <span>text</span></h1>
11
+ <!-- invalid markup follows -->
12
+ <p>Lorem<br>
13
+ <!-- another comment
14
+ here -->
15
+ ipsum
16
+ </body>
17
+ </html>
@@ -0,0 +1,18 @@
1
+ require 'test/unit'
2
+ require 'rdig'
3
+ #File.expand_path(File.dirname(__FILE__) + "/../init.rb")
4
+ # require File.expand_path(File.dirname(__FILE__) + "/../init.rb")
5
+
6
# Shared helpers for the unit tests: mixes in the RDig namespace and gives
# access to the fixture files stored below the fixtures/ directory that sits
# next to this file.
module TestHelper
  include RDig

  # Returns the complete contents of a fixture file as a String.
  #
  # path:: location of the file, relative to the fixtures/ directory.
  def read_fixture(path)
    helper_dir = File.expand_path(File.dirname(__FILE__))
    File.read("#{helper_dir}/fixtures/#{path}")
  end

  # Returns the contents of the HTML fixture html/<name>.html.
  #
  # name:: fixture base name, given without directory and without extension.
  def html_doc(name)
    fixture_path = "html/#{name}.html"
    read_fixture(fixture_path)
  end
end
@@ -0,0 +1,23 @@
1
+ require 'test_helper'
2
# Tests ETagFilter using OpenStruct stand-ins that only respond to #etag.
class ETagFilterTest < Test::Unit::TestCase
  include TestHelper, RDig

  # Every test starts with a filter that has not seen any document yet.
  def setup
    @filter = ETagFilter.new
  end

  # Expected behaviour, as pinned by the assertions below: a document without
  # an etag passes, a document with an etag passes the first time it is
  # applied, and is turned down on every later application -- also after a
  # different etag has been applied in between.
  def test_add
    untagged = OpenStruct.new(:etag => nil)
    assert(@filter.apply(untagged))

    first_tagged = OpenStruct.new(:etag => 'abc1234')
    assert(@filter.apply(first_tagged))
    assert(!@filter.apply(first_tagged))

    second_tagged = OpenStruct.new(:etag => 'abc1235')
    assert(@filter.apply(second_tagged))
    assert(!@filter.apply(second_tagged))
    # the earlier etag must still be turned down
    assert(!@filter.apply(first_tagged))
  end

end
@@ -0,0 +1,64 @@
1
+ require 'test_helper'
2
# Tests for ContentExtractors::HtmlContentExtractor, driven by the HTML
# fixtures that TestHelper#html_doc loads.
class HtmlContentExtractorTest < Test::Unit::TestCase
  include TestHelper

  # Saves a copy of the HTML extraction settings, because some tests install
  # their own title/content selectors through RDig.configuration.
  def setup
    @extractor = ContentExtractors::HtmlContentExtractor
    @nbsp = [160].pack('U') # non breaking space
    @config_backup = RDig.config.content_extraction.html.clone
  end

  # Puts the settings saved in setup back in place.
  def teardown
    RDig.config.content_extraction.html = @config_backup
  end

  # Title, text content and the single link of the 'simple' fixture.
  def test_simple
    result = @extractor.process(html_doc('simple'))
    assert_not_nil result
    assert_equal 'Sample Title', result[:title]
    assert_not_nil result[:content]
    assert_not_nil result[:links]
    assert_equal 1, result[:links].size
    assert_equal 'A Link Affe Some sample text Lorem ipsum', result[:content]
    assert_equal 'http://test.host/affe.html', result[:links].first
  end

  # Character entities are expected to come back decoded in the title, the
  # links and the content; the non breaking space has to survive as such.
  def test_entities
    result = @extractor.process(html_doc('entities'))
    assert_equal 'Sample & Title', result[:title]
    assert_equal 'http://test.host/affe.html?b=a&c=d', result[:links].first
    assert_equal 'http://test.host/affe2.html?b=a&c=d', result[:links].last
    assert_equal "Some > Links don't#{@nbsp}break me! Affe Affe Ümläuts heiß hier ß", result[:content]
  end

  # Title and content are taken from the elements picked by custom selectors.
  #
  # The :attrs hashes are written with '=>'. The comma separated pair form
  # ({ 'class', 'title' }) is only understood by Ruby 1.8 and is a syntax
  # error from Ruby 1.9 on.
  def test_custom_content_element
    RDig.configuration do |config|
      config.content_extraction.html.title_tag_selector = lambda do |tagsoup|
        tagsoup.find('h1', :attrs => { 'class' => 'title' })
      end
      config.content_extraction.html.content_tag_selector = lambda do |tagsoup|
        tagsoup.find('div', :attrs => { 'id' => 'content' })
      end
    end
    result = @extractor.process(html_doc('custom_tag_selectors'))
    assert_equal 'Sample Title in h1', result[:title]
    assert_equal 'Affe Real content is here.', result[:content]
    # check if links are collected outside the content tag, too:
    assert_equal 3, result[:links].size
    assert_equal 'http://test.host/outside.html', result[:links].first
    assert_equal '/inside.html', result[:links][1]
    assert_equal '/footer.html', result[:links][2]
  end

  # The title selector may also hand back a plain value, here the content
  # attribute of the DC.title meta tag.
  def test_title_from_dcmeta
    RDig.configuration do |config|
      config.content_extraction.html.title_tag_selector = lambda do |tagsoup|
        tagsoup.find('meta', :attrs => { 'name' => 'DC.title' })['content']
      end
    end
    result = @extractor.process(html_doc('custom_tag_selectors'))
    assert_equal 'Title from DC meta data', result[:title]
  end

end
64
+
@@ -0,0 +1,96 @@
1
+ require 'test_helper'
2
# Tests for the filter chain and the individual filters in UrlFilters.
class UrlFilterTest < Test::Unit::TestCase
  include TestHelper, RDig

  # Nothing to prepare; each test builds the filters it needs.
  def setup
  end

  # Chain whose filter arguments are passed in directly.
  def test_filterchain
    chain = UrlFilters::FilterChain.new([
      { UrlFilters::UrlInclusionFilter => /.+html$/ },
      { :hostname_filter => 'test.host' }
    ])

    assert_html_on_test_host_only(chain)
  end

  # Chain built from the default configuration returned by RDig.filter_chain.
  def test_default_filterchain
    default_chain = UrlFilters::FilterChain.new(RDig.filter_chain)

    assert_rejected(default_chain, "http://www.example.com/affe.htm")
    assert_accepted(default_chain, "http://localhost:3000/affe.html")
    assert_rejected(default_chain, "http://localhost.com/affe.html")
  end

  # Chain whose filter arguments are symbols that are looked up in the
  # crawler configuration.
  def test_filterchain_config
    RDig.configuration do |conf|
      conf.crawler.include_patterns = /.+html$/
      conf.crawler.include_hosts = 'test.host'
    end
    chain = UrlFilters::FilterChain.new([
      { UrlFilters::UrlInclusionFilter => :include_patterns },
      { :hostname_filter => :include_hosts }
    ])

    assert_html_on_test_host_only(chain)
  end

  # Inclusion and exclusion filters, with a single pattern and with a list.
  def test_urlpattern_filter
    html_only = UrlFilters::UrlInclusionFilter.new(/.*\.html$/)
    assert_rejected(html_only, "http://test.host/affe.htm")
    assert_accepted(html_only, "http://test.host/affe.html")

    no_html_or_aspx = UrlFilters::UrlExclusionFilter.new([ /.*\.html$/, /.*\.aspx/ ])
    assert_accepted(no_html_or_aspx, "http://test.host/affe.htm")
    assert_rejected(no_html_or_aspx, "http://test.host/affe.html")
    assert_rejected(no_html_or_aspx, "http://test.host/affe.aspx")

    # pattern tied to the first path segment after the host
    no_top_level_dir1 = UrlFilters::UrlExclusionFilter.new([ /http:\/\/[^\/]+\/dir1/ ])
    assert_rejected(no_top_level_dir1, "http://test.host/dir1/affe.aspx")
    assert_accepted(no_top_level_dir1, "http://test.host/dir2/dir1/affe.htm")
    assert_accepted(no_top_level_dir1, "http://test.host/affe.htm")
    assert_accepted(no_top_level_dir1, "http://test.host/dir2/affe.htm")

    # pattern that may match at any depth of the path
    no_dir1_anywhere = UrlFilters::UrlExclusionFilter.new([ /\/dir1/ ])
    assert_rejected(no_dir1_anywhere, "http://test.host/dir1/affe.aspx")
    assert_rejected(no_dir1_anywhere, "http://test.host/dir2/dir1/affe.htm")
    assert_accepted(no_dir1_anywhere, "http://test.host/affe.htm")
    assert_accepted(no_dir1_anywhere, "http://test.host/dir2/affe.htm")
  end

  # Only documents from one of the listed hosts are expected to get through.
  def test_hostname_filter
    allowed_hosts = [ 'test.host', 'localhost' ]
    foreign = UrlFilters.hostname_filter(Document.new('http://google.com/'), allowed_hosts)
    assert_nil(foreign)
    on_test_host = UrlFilters.hostname_filter(Document.new('http://test.host/file.html'), allowed_hosts)
    assert_not_nil(on_test_host)
    on_localhost = UrlFilters.hostname_filter(Document.new('http://localhost/file.html'), allowed_hosts)
    assert_not_nil(on_localhost)
  end

  # Links found in a page are resolved against the URI of that page.
  def test_fix_relative_uri
    page = Document.new('http://test.host/dir/file.html')
    assert_equal('http://test.host/dir/another.html', resolved_uri('another.html', page))
    # '..' segments are expected to be kept as they are
    assert_equal('http://test.host/dir/../another.html', resolved_uri('../another.html', page))
    assert_equal('http://test.host/dir/another.html', resolved_uri('/dir/another.html', page))
    assert_equal('http://test.host/dir/another.html',
                 resolved_uri('http://test.host/dir/another.html', page))
    # an upper case scheme is expected to come back unchanged
    assert_equal('HTTP://test.host/dir/another.html',
                 resolved_uri('HTTP://test.host/dir/another.html', page))

    page = Document.new('https://test.host/dir/')
    assert_equal('https://test.host/dir/another.html', resolved_uri('another.html', page))

    page = Document.new('https://test.host/')
    assert_equal('https://test.host/another.html', resolved_uri('another.html', page))

    # referring URI without any path
    page = Document.new('https://test.host')
    assert_equal('https://test.host/another.html', resolved_uri('another.html', page))
  end

  private

  # Passes when +filter+ lets the document for +url+ through (non-nil result).
  def assert_accepted(filter, url)
    assert_not_nil filter.apply(Document.new(url))
  end

  # Passes when +filter+ drops the document for +url+ (nil result).
  def assert_rejected(filter, url)
    assert_nil filter.apply(Document.new(url))
  end

  # Expectations shared by both chain tests: only URLs ending in 'html' on
  # the host test.host get through; test.host.com counts as another host.
  def assert_html_on_test_host_only(chain)
    assert_rejected(chain, "http://test.host/affe.htm")
    assert_accepted(chain, "http://test.host/affe.html")
    assert_rejected(chain, "http://test.host.com/affe.html")
  end

  # Runs fix_relative_uri on a document created for +link+ with the URI of
  # +referrer+ as second argument, and returns the resulting URI as a String.
  def resolved_uri(link, referrer)
    UrlFilters.fix_relative_uri(Document.new(link, referrer.uri)).uri.to_s
  end
end
96
+
metadata ADDED
@@ -0,0 +1,102 @@
1
+ --- !ruby/object:Gem::Specification
2
+ rubygems_version: 0.8.10
3
+ specification_version: 1
4
+ name: rdig
5
+ version: !ruby/object:Gem::Version
6
+ version: 0.1.0
7
+ date: 2006-03-25
8
+ summary: Ruby based web site indexing and searching library.
9
+ require_paths:
10
+ - lib
11
+ email: jk@jkraemer.net
12
+ homepage: http://rdig.rubyforge.org/
13
+ rubyforge_project: rdig
14
+ description: "RDig provides an HTTP crawler and content extraction utilities to help building
15
+ a site search for web sites or intranets. Internally, Ferret is used for the
16
+ full text indexing. After creating a config file for your site, the index can
17
+ be built with a single call to rdig."
18
+ autorequire:
19
+ default_executable: rdig
20
+ bindir: bin
21
+ has_rdoc: true
22
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
23
+ requirements:
24
+ -
25
+ - ">"
26
+ - !ruby/object:Gem::Version
27
+ version: 0.0.0
28
+ version:
29
+ platform: ruby
30
+ authors:
31
+ - Jens Kraemer
32
+ files:
33
+ - bin/rdig
34
+ - lib/rdig
35
+ - lib/htmlentities
36
+ - lib/rdig.rb
37
+ - lib/rdig/http_client.rb
38
+ - lib/rdig/crawler.rb
39
+ - lib/rdig/search.rb
40
+ - lib/rdig/highlight.rb
41
+ - lib/rdig/index.rb
42
+ - lib/rdig/url_filters.rb
43
+ - lib/rdig/content_extractors.rb
44
+ - lib/htmlentities/CHANGES
45
+ - lib/htmlentities/COPYING
46
+ - lib/htmlentities/README
47
+ - lib/htmlentities/htmlentities.rb
48
+ - test/unit
49
+ - test/fixtures
50
+ - test/test_helper.rb
51
+ - test/unit/etag_filter_test.rb
52
+ - test/unit/url_filters_test.rb
53
+ - test/unit/html_content_extractor_test.rb
54
+ - test/fixtures/html
55
+ - test/fixtures/html/entities.html
56
+ - test/fixtures/html/simple.html
57
+ - test/fixtures/html/custom_tag_selectors.html
58
+ - doc/examples
59
+ - doc/examples/config.rb
60
+ - LICENSE
61
+ - TODO
62
+ - CHANGES
63
+ - README
64
+ - install.rb
65
+ - rakefile
66
+ test_files: []
67
+ rdoc_options:
68
+ - "--title"
69
+ - "Rake -- Ruby Make"
70
+ - "--main"
71
+ - README
72
+ - "--line-numbers"
73
+ extra_rdoc_files:
74
+ - README
75
+ - CHANGES
76
+ - LICENSE
77
+ - TODO
78
+ executables:
79
+ - rdig
80
+ extensions: []
81
+ requirements: []
82
+ dependencies:
83
+ - !ruby/object:Gem::Dependency
84
+ name: ferret
85
+ version_requirement:
86
+ version_requirements: !ruby/object:Gem::Version::Requirement
87
+ requirements:
88
+ -
89
+ - ">="
90
+ - !ruby/object:Gem::Version
91
+ version: 0.3.2
92
+ version:
93
+ - !ruby/object:Gem::Dependency
94
+ name: rubyful_soup
95
+ version_requirement:
96
+ version_requirements: !ruby/object:Gem::Version::Requirement
97
+ requirements:
98
+ -
99
+ - ">="
100
+ - !ruby/object:Gem::Version
101
+ version: 1.0.4
102
+ version: