rdig 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,25 @@
1
+ <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
2
+ <html>
3
+ <head>
4
+ <meta http-equiv="content-type" content="text/html;charset=iso-8859-1">
5
+ <meta name="DC.title" content="Title from DC meta data" />
6
+ <title>Sample Title</title>
7
+ </head>
8
+ <body>
9
+ <h1 class="title">Sample Title in h1</h1>
10
+ <a href="http://test.host/outside.html">Affe</a>
11
+ <h1>Some sample <span>text</span></h1>
12
+ <div id="content">
13
+ <a href="/inside.html">Affe</a>
14
+ <!-- some comment -->
15
+ <p>
16
+ Real content is here.</p>
17
+ </div>
18
+ <!-- another comment
19
+ here -->
20
+ <p>Some footer</p>
21
+ <a href="#foo">Top</a>
22
+ <a href="/footer.html">Affe</a>
23
+ </body>
24
+ </html>
25
+
@@ -0,0 +1,15 @@
1
+ <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
2
+ <html>
3
+ <head>
4
+ <meta http-equiv="content-type" content="text/html;charset=iso-8859-1">
5
+ <title>Sample &amp; Title</title>
6
+ </head>
7
+ <body>
8
+ <h1>Some &gt; Links</h1>
9
+ <p>don't&nbsp;break me!</p>
10
+ <a href="http://test.host/affe.html?b=a&amp;c=d">Affe</a>
11
+ <a href="http://test.host/affe2.html?b=a&c=d">Affe</a>
12
+ <h1>&Uuml;ml&auml;uts</h1>
13
+ <p>hei&szlig; hier &#223;</p>
14
+ </body>
15
+ </html>
@@ -0,0 +1,17 @@
1
+ <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
2
+ <html>
3
+ <head>
4
+ <meta http-equiv="content-type" content="text/html;charset=iso-8859-1">
5
+ <title>Sample Title</title>
6
+ </head>
7
+ <body>
8
+ <h1>A Link</h1>
9
+ <a href="http://test.host/affe.html">Affe</a>
10
+ <h1>Some sample <span>text</span></h1>
11
+ <!-- invalid markup follows -->
12
+ <p>Lorem<br>
13
+ <!-- another comment
14
+ here -->
15
+ ipsum
16
+ </body>
17
+ </html>
@@ -0,0 +1,18 @@
1
+ require 'test/unit'
2
+ require 'rdig'
3
+ #File.expand_path(File.dirname(__FILE__) + "/../init.rb")
4
+ # require File.expand_path(File.dirname(__FILE__) + "/../init.rb")
5
+
6
+ module TestHelper
7
+ include RDig
8
+
9
+ def read_fixture(path)
10
+ File.open("#{File.expand_path(File.dirname(__FILE__))}/fixtures/#{path}") { |f|
11
+ f.read
12
+ }
13
+ end
14
+
15
+ def html_doc(name)
16
+ read_fixture("html/#{name}.html")
17
+ end
18
+ end
@@ -0,0 +1,23 @@
1
+ require 'test_helper'
2
# Unit test for ETagFilter: a given etag is expected to pass the filter on
# its first appearance only.
class ETagFilterTest < Test::Unit::TestCase
  include TestHelper, RDig

  def setup
    @filter = ETagFilter.new
  end

  def test_add
    # a document that carries no etag is expected to pass
    untagged = OpenStruct.new(:etag => nil)
    assert @filter.apply(untagged)

    first = OpenStruct.new(:etag => 'abc1234')
    assert_passes_only_once(first)

    second = OpenStruct.new(:etag => 'abc1235')
    assert_passes_only_once(second)

    # the first etag must still be rejected after another one was added
    assert !@filter.apply(first)
  end

  private

  # Applies the filter to +doc+ twice in a row: the first call must return
  # a truthy value, the immediate repeat a falsy one.
  def assert_passes_only_once(doc)
    assert @filter.apply(doc)
    assert !@filter.apply(doc)
  end

end
@@ -0,0 +1,64 @@
1
+ require 'test_helper'
2
+ class HtmlContentExtractorTest < Test::Unit::TestCase
3
+ include TestHelper
4
+
5
+ def setup
6
+ @extractor = ContentExtractors::HtmlContentExtractor
7
+ @nbsp = [160].pack('U') # non breaking space
8
+ @config_backup = RDig.config.content_extraction.html.clone
9
+ end
10
+
11
+ def teardown
12
+ RDig.config.content_extraction.html = @config_backup
13
+ end
14
+
15
+ def test_simple
16
+ result = @extractor.process(html_doc('simple'))
17
+ assert_not_nil result
18
+ assert_equal 'Sample Title', result[:title]
19
+ assert_not_nil result[:content]
20
+ assert_not_nil result[:links]
21
+ assert_equal 1, result[:links].size
22
+ assert_equal 'A Link Affe Some sample text Lorem ipsum', result[:content]
23
+ assert_equal 'http://test.host/affe.html', result[:links].first
24
+ end
25
+
26
+ def test_entities
27
+ result = @extractor.process(html_doc('entities'))
28
+ assert_equal 'Sample & Title', result[:title]
29
+ assert_equal 'http://test.host/affe.html?b=a&c=d', result[:links].first
30
+ assert_equal 'http://test.host/affe2.html?b=a&c=d', result[:links].last
31
+ assert_equal "Some > Links don't#{@nbsp}break me! Affe Affe Ümläuts heiß hier ß", result[:content]
32
+ end
33
+
34
+ def test_custom_content_element
35
+ RDig.configuration do |config|
36
+ config.content_extraction.html.title_tag_selector = lambda do |tagsoup|
37
+ tagsoup.find('h1', :attrs => { 'class', 'title' })
38
+ end
39
+ config.content_extraction.html.content_tag_selector = lambda do |tagsoup|
40
+ tagsoup.find('div', :attrs => { 'id', 'content' })
41
+ end
42
+ end
43
+ result = @extractor.process(html_doc('custom_tag_selectors'))
44
+ assert_equal 'Sample Title in h1', result[:title]
45
+ assert_equal 'Affe Real content is here.', result[:content]
46
+ # check if links are collected outside the content tag, too:
47
+ assert_equal 3, result[:links].size
48
+ assert_equal 'http://test.host/outside.html', result[:links].first
49
+ assert_equal '/inside.html', result[:links][1]
50
+ assert_equal '/footer.html', result[:links][2]
51
+ end
52
+
53
+ def test_title_from_dcmeta
54
+ RDig.configuration do |config|
55
+ config.content_extraction.html.title_tag_selector = lambda do |tagsoup|
56
+ tagsoup.find('meta', :attrs => { 'name', 'DC.title' })['content']
57
+ end
58
+ end
59
+ result = @extractor.process(html_doc('custom_tag_selectors'))
60
+ assert_equal 'Title from DC meta data', result[:title]
61
+ end
62
+
63
+ end
64
+
@@ -0,0 +1,96 @@
1
+ require 'test_helper'
2
# Unit tests for the URL filters and filter chains in RDig's UrlFilters.
class UrlFilterTest < Test::Unit::TestCase
  include TestHelper, RDig

  def setup
  end

  # test a chain configured with direct parameters
  def test_filterchain
    chain = UrlFilters::FilterChain.new([
      { UrlFilters::UrlInclusionFilter => /.+html$/ },
      { :hostname_filter => 'test.host' }
    ])

    assert_blocked chain, "http://test.host/affe.htm"
    assert_passed  chain, "http://test.host/affe.html"
    assert_blocked chain, "http://test.host.com/affe.html"
  end

  # test default chain config
  def test_default_filterchain
    chain = UrlFilters::FilterChain.new(RDig.filter_chain)

    assert_blocked chain, "http://www.example.com/affe.htm"
    assert_passed  chain, "http://localhost:3000/affe.html"
    assert_blocked chain, "http://localhost.com/affe.html"
  end

  # check lookup of chain parameters from config
  def test_filterchain_config
    RDig.configuration do |conf|
      conf.crawler.include_patterns = /.+html$/
      conf.crawler.include_hosts = 'test.host'
    end
    chain = UrlFilters::FilterChain.new([
      { UrlFilters::UrlInclusionFilter => :include_patterns },
      { :hostname_filter => :include_hosts }
    ])

    assert_blocked chain, "http://test.host/affe.htm"
    assert_passed  chain, "http://test.host/affe.html"
    assert_blocked chain, "http://test.host.com/affe.html"
  end

  # inclusion and exclusion filters, built from one pattern or a pattern list
  def test_urlpattern_filter
    html_only = UrlFilters::UrlInclusionFilter.new(/.*\.html$/)
    assert_blocked html_only, "http://test.host/affe.htm"
    assert_passed  html_only, "http://test.host/affe.html"

    no_html_or_aspx = UrlFilters::UrlExclusionFilter.new([ /.*\.html$/, /.*\.aspx/ ])
    assert_passed  no_html_or_aspx, "http://test.host/affe.htm"
    assert_blocked no_html_or_aspx, "http://test.host/affe.html"
    assert_blocked no_html_or_aspx, "http://test.host/affe.aspx"

    # pattern tied to the host part: only a top level dir1 is excluded
    no_top_level_dir1 = UrlFilters::UrlExclusionFilter.new([ /http:\/\/[^\/]+\/dir1/ ])
    assert_blocked no_top_level_dir1, "http://test.host/dir1/affe.aspx"
    assert_passed  no_top_level_dir1, "http://test.host/dir2/dir1/affe.htm"
    assert_passed  no_top_level_dir1, "http://test.host/affe.htm"
    assert_passed  no_top_level_dir1, "http://test.host/dir2/affe.htm"

    # pattern without the host part: dir1 is excluded at any depth
    no_dir1 = UrlFilters::UrlExclusionFilter.new([ /\/dir1/ ])
    assert_blocked no_dir1, "http://test.host/dir1/affe.aspx"
    assert_blocked no_dir1, "http://test.host/dir2/dir1/affe.htm"
    assert_passed  no_dir1, "http://test.host/affe.htm"
    assert_passed  no_dir1, "http://test.host/dir2/affe.htm"
  end

  def test_hostname_filter
    allowed_hosts = [ 'test.host', 'localhost' ]
    outside = UrlFilters.hostname_filter(Document.new('http://google.com/'), allowed_hosts)
    assert_nil outside
    on_test_host = UrlFilters.hostname_filter(Document.new('http://test.host/file.html'), allowed_hosts)
    assert_not_nil on_test_host
    on_localhost = UrlFilters.hostname_filter(Document.new('http://localhost/file.html'), allowed_hosts)
    assert_not_nil on_localhost
  end

  # Each entry pairs the URL of a referring document with a list of
  # [link, expected uri after fix_relative_uri] cases.
  def test_fix_relative_uri
    expectations = [
      [ 'http://test.host/dir/file.html', [
        [ 'another.html',                      'http://test.host/dir/another.html' ],
        [ '../another.html',                   'http://test.host/dir/../another.html' ],
        [ '/dir/another.html',                 'http://test.host/dir/another.html' ],
        [ 'http://test.host/dir/another.html', 'http://test.host/dir/another.html' ],
        [ 'HTTP://test.host/dir/another.html', 'HTTP://test.host/dir/another.html' ]
      ] ],
      [ 'https://test.host/dir/', [ [ 'another.html', 'https://test.host/dir/another.html' ] ] ],
      [ 'https://test.host/',     [ [ 'another.html', 'https://test.host/another.html' ] ] ],
      [ 'https://test.host',      [ [ 'another.html', 'https://test.host/another.html' ] ] ]
    ]

    expectations.each do |referrer_url, cases|
      referrer = Document.new(referrer_url)
      cases.each do |link, expected|
        fixed = UrlFilters.fix_relative_uri(Document.new(link, referrer.uri))
        assert_equal(expected, fixed.uri.to_s)
      end
    end
  end

  private

  # Asserts that +filter+ (a single filter or a chain) returns a non-nil
  # value for a document created from +url+.
  def assert_passed(filter, url)
    assert_not_nil filter.apply(Document.new(url))
  end

  # Asserts that +filter+ (a single filter or a chain) returns nil for a
  # document created from +url+.
  def assert_blocked(filter, url)
    assert_nil filter.apply(Document.new(url))
  end
end
96
+
metadata ADDED
@@ -0,0 +1,102 @@
1
+ --- !ruby/object:Gem::Specification
2
+ rubygems_version: 0.8.10
3
+ specification_version: 1
4
+ name: rdig
5
+ version: !ruby/object:Gem::Version
6
+ version: 0.1.0
7
+ date: 2006-03-25
8
+ summary: Ruby based web site indexing and searching library.
9
+ require_paths:
10
+ - lib
11
+ email: jk@jkraemer.net
12
+ homepage: http://rdig.rubyforge.org/
13
+ rubyforge_project: rdig
14
+ description: "RDig provides an HTTP crawler and content extraction utilities to help building
15
+ a site search for web sites or intranets. Internally, Ferret is used for the
16
+ full text indexing. After creating a config file for your site, the index can
17
+ be built with a single call to rdig."
18
+ autorequire:
19
+ default_executable: rdig
20
+ bindir: bin
21
+ has_rdoc: true
22
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
23
+ requirements:
24
+ -
25
+ - ">"
26
+ - !ruby/object:Gem::Version
27
+ version: 0.0.0
28
+ version:
29
+ platform: ruby
30
+ authors:
31
+ - Jens Kraemer
32
+ files:
33
+ - bin/rdig
34
+ - lib/rdig
35
+ - lib/htmlentities
36
+ - lib/rdig.rb
37
+ - lib/rdig/http_client.rb
38
+ - lib/rdig/crawler.rb
39
+ - lib/rdig/search.rb
40
+ - lib/rdig/highlight.rb
41
+ - lib/rdig/index.rb
42
+ - lib/rdig/url_filters.rb
43
+ - lib/rdig/content_extractors.rb
44
+ - lib/htmlentities/CHANGES
45
+ - lib/htmlentities/COPYING
46
+ - lib/htmlentities/README
47
+ - lib/htmlentities/htmlentities.rb
48
+ - test/unit
49
+ - test/fixtures
50
+ - test/test_helper.rb
51
+ - test/unit/etag_filter_test.rb
52
+ - test/unit/url_filters_test.rb
53
+ - test/unit/html_content_extractor_test.rb
54
+ - test/fixtures/html
55
+ - test/fixtures/html/entities.html
56
+ - test/fixtures/html/simple.html
57
+ - test/fixtures/html/custom_tag_selectors.html
58
+ - doc/examples
59
+ - doc/examples/config.rb
60
+ - LICENSE
61
+ - TODO
62
+ - CHANGES
63
+ - README
64
+ - install.rb
65
+ - rakefile
66
+ test_files: []
67
+ rdoc_options:
68
+ - "--title"
69
+ - "Rake -- Ruby Make"
70
+ - "--main"
71
+ - README
72
+ - "--line-numbers"
73
+ extra_rdoc_files:
74
+ - README
75
+ - CHANGES
76
+ - LICENSE
77
+ - TODO
78
+ executables:
79
+ - rdig
80
+ extensions: []
81
+ requirements: []
82
+ dependencies:
83
+ - !ruby/object:Gem::Dependency
84
+ name: ferret
85
+ version_requirement:
86
+ version_requirements: !ruby/object:Gem::Version::Requirement
87
+ requirements:
88
+ -
89
+ - ">="
90
+ - !ruby/object:Gem::Version
91
+ version: 0.3.2
92
+ version:
93
+ - !ruby/object:Gem::Dependency
94
+ name: rubyful_soup
95
+ version_requirement:
96
+ version_requirements: !ruby/object:Gem::Version::Requirement
97
+ requirements:
98
+ -
99
+ - ">="
100
+ - !ruby/object:Gem::Version
101
+ version: 1.0.4
102
+ version: