rdig 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGES +2 -0
- data/LICENSE +20 -0
- data/README +61 -0
- data/TODO +0 -0
- data/bin/rdig +32 -0
- data/doc/examples/config.rb +53 -0
- data/install.rb +89 -0
- data/lib/htmlentities/CHANGES +21 -0
- data/lib/htmlentities/COPYING +7 -0
- data/lib/htmlentities/README +15 -0
- data/lib/htmlentities/htmlentities.rb +281 -0
- data/lib/rdig.rb +243 -0
- data/lib/rdig/content_extractors.rb +145 -0
- data/lib/rdig/crawler.rb +176 -0
- data/lib/rdig/highlight.rb +24 -0
- data/lib/rdig/http_client.rb +22 -0
- data/lib/rdig/index.rb +39 -0
- data/lib/rdig/search.rb +77 -0
- data/lib/rdig/url_filters.rb +171 -0
- data/rakefile +325 -0
- data/test/fixtures/html/custom_tag_selectors.html +25 -0
- data/test/fixtures/html/entities.html +15 -0
- data/test/fixtures/html/simple.html +17 -0
- data/test/test_helper.rb +18 -0
- data/test/unit/etag_filter_test.rb +23 -0
- data/test/unit/html_content_extractor_test.rb +64 -0
- data/test/unit/url_filters_test.rb +96 -0
- metadata +102 -0
@@ -0,0 +1,25 @@
|
|
1
|
+
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
|
2
|
+
<html>
|
3
|
+
<head>
|
4
|
+
<meta http-equiv="content-type" content="text/html;charset=iso-8859-1">
|
5
|
+
<meta name="DC.title" content="Title from DC meta data" />
|
6
|
+
<title>Sample Title</title>
|
7
|
+
</head>
|
8
|
+
<body>
|
9
|
+
<h1 class="title">Sample Title in h1</h1>
|
10
|
+
<a href="http://test.host/outside.html">Affe</a>
|
11
|
+
<h1>Some sample <span>text</span></h1>
|
12
|
+
<div id="content">
|
13
|
+
<a href="/inside.html">Affe</a>
|
14
|
+
<!-- some comment -->
|
15
|
+
<p>
|
16
|
+
Real content is here.</p>
|
17
|
+
</div>
|
18
|
+
<!-- another comment
|
19
|
+
here -->
|
20
|
+
<p>Some footer</p>
|
21
|
+
<a href="#foo">Top</a>
|
22
|
+
<a href="/footer.html">Affe</a>
|
23
|
+
</body>
|
24
|
+
</html>
|
25
|
+
|
@@ -0,0 +1,15 @@
|
|
1
|
+
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
|
2
|
+
<html>
|
3
|
+
<head>
|
4
|
+
<meta http-equiv="content-type" content="text/html;charset=iso-8859-1">
|
5
|
+
<title>Sample & Title</title>
|
6
|
+
</head>
|
7
|
+
<body>
|
8
|
+
<h1>Some > Links</h1>
|
9
|
+
<p>don't break me!</p>
|
10
|
+
<a href="http://test.host/affe.html?b=a&c=d">Affe</a>
|
11
|
+
<a href="http://test.host/affe2.html?b=a&c=d">Affe</a>
|
12
|
+
<h1>Ümläuts</h1>
|
13
|
+
<p>heiß hier ß</p>
|
14
|
+
</body>
|
15
|
+
</html>
|
@@ -0,0 +1,17 @@
|
|
1
|
+
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
|
2
|
+
<html>
|
3
|
+
<head>
|
4
|
+
<meta http-equiv="content-type" content="text/html;charset=iso-8859-1">
|
5
|
+
<title>Sample Title</title>
|
6
|
+
</head>
|
7
|
+
<body>
|
8
|
+
<h1>A Link</h1>
|
9
|
+
<a href="http://test.host/affe.html">Affe</a>
|
10
|
+
<h1>Some sample <span>text</span></h1>
|
11
|
+
<!-- invalid markup follows -->
|
12
|
+
<p>Lorem<br>
|
13
|
+
<!-- another comment
|
14
|
+
here -->
|
15
|
+
ipsum
|
16
|
+
</body>
|
17
|
+
</html>
|
data/test/test_helper.rb
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
require 'test/unit'
|
2
|
+
require 'rdig'
|
3
|
+
#File.expand_path(File.dirname(__FILE__) + "/../init.rb")
|
4
|
+
# require File.expand_path(File.dirname(__FILE__) + "/../init.rb")
|
5
|
+
|
6
|
+
# Helpers shared by the unit tests for loading fixture files.
# Includes RDig, so classes that include TestHelper also get RDig in their
# ancestor chain.
module TestHelper
  include RDig

  # Returns the contents of the file at +path+, resolved relative to the
  # "fixtures" directory that sits next to this helper file.
  def read_fixture(path)
    helper_dir = File.expand_path(File.dirname(__FILE__))
    File.read("#{helper_dir}/fixtures/#{path}")
  end

  # Returns the contents of the HTML fixture +name+; the "html/" directory
  # prefix and the ".html" extension are added here.
  def html_doc(name)
    read_fixture("html/#{name}.html")
  end
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'test_helper'
|
2
|
+
# Checks how ETagFilter#apply responds to documents without an etag, with a
# new etag, and with an etag it has been given before.
class ETagFilterTest < Test::Unit::TestCase
  include TestHelper, RDig

  def setup
    @filter = ETagFilter.new
  end

  def test_add
    # a document whose etag is nil gets a truthy result
    without_etag = OpenStruct.new(:etag => nil)
    assert @filter.apply(without_etag)

    # the first application of an etag is truthy, the repeat is falsy
    first_doc = OpenStruct.new(:etag => 'abc1234')
    assert @filter.apply(first_doc)
    assert !@filter.apply(first_doc)

    # a different etag behaves the same way, and the earlier etag is
    # still rejected afterwards
    second_doc = OpenStruct.new(:etag => 'abc1235')
    assert @filter.apply(second_doc)
    assert !@filter.apply(second_doc)
    assert !@filter.apply(first_doc)
  end
end
|
@@ -0,0 +1,64 @@
|
|
1
|
+
require 'test_helper'
|
2
|
+
# Tests for ContentExtractors::HtmlContentExtractor, run against the HTML
# fixtures loaded through TestHelper#html_doc.
#
# The :attrs hashes below are written with `=>`; the comma-separated form
# (`{ 'class', 'title' }`) is only accepted by Ruby 1.8 and is a syntax
# error on later versions.
class HtmlContentExtractorTest < Test::Unit::TestCase
  include TestHelper

  def setup
    @extractor = ContentExtractors::HtmlContentExtractor
    @nbsp = [160].pack('U') # non breaking space
    # keep a copy of the html extraction settings so teardown can put them
    # back after a test has replaced the tag selectors
    @config_backup = RDig.config.content_extraction.html.clone
  end

  def teardown
    RDig.config.content_extraction.html = @config_backup
  end

  # Title, text content and the single link of the 'simple' fixture.
  def test_simple
    result = @extractor.process(html_doc('simple'))
    assert_not_nil result
    assert_equal 'Sample Title', result[:title]
    assert_not_nil result[:content]
    assert_not_nil result[:links]
    assert_equal 1, result[:links].size
    assert_equal 'A Link Affe Some sample text Lorem ipsum', result[:content]
    assert_equal 'http://test.host/affe.html', result[:links].first
  end

  # Expected title, links and content for the 'entities' fixture.
  def test_entities
    result = @extractor.process(html_doc('entities'))
    assert_equal 'Sample & Title', result[:title]
    assert_equal 'http://test.host/affe.html?b=a&c=d', result[:links].first
    assert_equal 'http://test.host/affe2.html?b=a&c=d', result[:links].last
    assert_equal "Some > Links don't#{@nbsp}break me! Affe Affe Ümläuts heiß hier ß", result[:content]
  end

  # Custom selectors pick the title from <h1 class="title"> and the content
  # from <div id="content">.
  def test_custom_content_element
    RDig.configuration do |config|
      config.content_extraction.html.title_tag_selector = lambda do |tagsoup|
        tagsoup.find('h1', :attrs => { 'class' => 'title' })
      end
      config.content_extraction.html.content_tag_selector = lambda do |tagsoup|
        tagsoup.find('div', :attrs => { 'id' => 'content' })
      end
    end
    result = @extractor.process(html_doc('custom_tag_selectors'))
    assert_equal 'Sample Title in h1', result[:title]
    assert_equal 'Affe Real content is here.', result[:content]
    # check if links are collected outside the content tag, too:
    assert_equal 3, result[:links].size
    assert_equal 'http://test.host/outside.html', result[:links].first
    assert_equal '/inside.html', result[:links][1]
    assert_equal '/footer.html', result[:links][2]
  end

  # A title selector may return the 'content' attribute value of the
  # DC.title meta tag instead of a tag.
  def test_title_from_dcmeta
    RDig.configuration do |config|
      config.content_extraction.html.title_tag_selector = lambda do |tagsoup|
        tagsoup.find('meta', :attrs => { 'name' => 'DC.title' })['content']
      end
    end
    result = @extractor.process(html_doc('custom_tag_selectors'))
    assert_equal 'Title from DC meta data', result[:title]
  end
end
|
64
|
+
|
@@ -0,0 +1,96 @@
|
|
1
|
+
require 'test_helper'
|
2
|
+
# Tests for the UrlFilters module: filter chains, the URL inclusion and
# exclusion filters, the hostname filter and relative URI fixing.
class UrlFilterTest < Test::Unit::TestCase
  include TestHelper, RDig

  def setup
  end

  # test a chain configured with direct parameters
  def test_filterchain
    chain = UrlFilters::FilterChain.new([
      { UrlFilters::UrlInclusionFilter => /.+html$/ },
      { :hostname_filter => 'test.host' }
    ])

    assert_only_html_on_test_host(chain)
  end

  # test default chain config
  def test_default_filterchain
    chain = UrlFilters::FilterChain.new(RDig.filter_chain)
    assert_nil chain.apply(Document.new("http://www.example.com/affe.htm"))
    assert_not_nil chain.apply(Document.new("http://localhost:3000/affe.html"))
    assert_nil chain.apply(Document.new("http://localhost.com/affe.html"))
  end

  # check lookup of chain parameters from config
  def test_filterchain_config
    RDig.configuration do |conf|
      conf.crawler.include_patterns = /.+html$/
      conf.crawler.include_hosts = 'test.host'
    end
    chain = UrlFilters::FilterChain.new([
      { UrlFilters::UrlInclusionFilter => :include_patterns },
      { :hostname_filter => :include_hosts }
    ])

    assert_only_html_on_test_host(chain)
  end

  def test_urlpattern_filter
    inclusion = UrlFilters::UrlInclusionFilter.new(/.*\.html$/)
    assert_nil inclusion.apply(Document.new("http://test.host/affe.htm"))
    assert_not_nil inclusion.apply(Document.new("http://test.host/affe.html"))

    by_extension = UrlFilters::UrlExclusionFilter.new([ /.*\.html$/, /.*\.aspx/ ])
    assert_not_nil by_extension.apply(Document.new("http://test.host/affe.htm"))
    assert_nil by_extension.apply(Document.new("http://test.host/affe.html"))
    assert_nil by_extension.apply(Document.new("http://test.host/affe.aspx"))

    # pattern tied to the first path segment after the host
    top_level_dir = UrlFilters::UrlExclusionFilter.new([ /http:\/\/[^\/]+\/dir1/ ])
    assert_nil top_level_dir.apply(Document.new("http://test.host/dir1/affe.aspx"))
    assert_not_nil top_level_dir.apply(Document.new("http://test.host/dir2/dir1/affe.htm"))
    assert_not_nil top_level_dir.apply(Document.new("http://test.host/affe.htm"))
    assert_not_nil top_level_dir.apply(Document.new("http://test.host/dir2/affe.htm"))

    # pattern matching "/dir1" anywhere in the url
    any_dir = UrlFilters::UrlExclusionFilter.new([ /\/dir1/ ])
    assert_nil any_dir.apply(Document.new("http://test.host/dir1/affe.aspx"))
    assert_nil any_dir.apply(Document.new("http://test.host/dir2/dir1/affe.htm"))
    assert_not_nil any_dir.apply(Document.new("http://test.host/affe.htm"))
    assert_not_nil any_dir.apply(Document.new("http://test.host/dir2/affe.htm"))
  end

  def test_hostname_filter
    allowed_hosts = [ 'test.host', 'localhost' ]
    assert_nil UrlFilters.hostname_filter(Document.new('http://google.com/'), allowed_hosts)
    assert_not_nil UrlFilters.hostname_filter(Document.new('http://test.host/file.html'), allowed_hosts)
    assert_not_nil UrlFilters.hostname_filter(Document.new('http://localhost/file.html'), allowed_hosts)
  end

  def test_fix_relative_uri
    base = Document.new('http://test.host/dir/file.html')
    assert_equal('http://test.host/dir/another.html', fixed_uri('another.html', base))
    assert_equal('http://test.host/dir/../another.html', fixed_uri('../another.html', base))
    assert_equal('http://test.host/dir/another.html', fixed_uri('/dir/another.html', base))
    assert_equal('http://test.host/dir/another.html',
                 fixed_uri('http://test.host/dir/another.html', base))
    assert_equal('HTTP://test.host/dir/another.html',
                 fixed_uri('HTTP://test.host/dir/another.html', base))

    base = Document.new('https://test.host/dir/')
    assert_equal('https://test.host/dir/another.html', fixed_uri('another.html', base))

    base = Document.new('https://test.host/')
    assert_equal('https://test.host/another.html', fixed_uri('another.html', base))

    base = Document.new('https://test.host')
    assert_equal('https://test.host/another.html', fixed_uri('another.html', base))
  end

  private

  # The three assertions shared by the chain tests: the ".htm" url and the
  # "test.host.com" url yield nil, the ".html" url on test.host does not.
  def assert_only_html_on_test_host(chain)
    assert_nil chain.apply(Document.new("http://test.host/affe.htm"))
    assert_not_nil chain.apply(Document.new("http://test.host/affe.html"))
    assert_nil chain.apply(Document.new("http://test.host.com/affe.html"))
  end

  # Builds a Document for +link+ with +base+'s uri as second argument, runs
  # it through UrlFilters.fix_relative_uri and returns the resulting uri as
  # a string.
  def fixed_uri(link, base)
    UrlFilters.fix_relative_uri(Document.new(link, base.uri)).uri.to_s
  end
end
|
96
|
+
|
metadata
ADDED
@@ -0,0 +1,102 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
rubygems_version: 0.8.10
|
3
|
+
specification_version: 1
|
4
|
+
name: rdig
|
5
|
+
version: !ruby/object:Gem::Version
|
6
|
+
version: 0.1.0
|
7
|
+
date: 2006-03-25
|
8
|
+
summary: Ruby based web site indexing and searching library.
|
9
|
+
require_paths:
|
10
|
+
- lib
|
11
|
+
email: jk@jkraemer.net
|
12
|
+
homepage: http://rdig.rubyforge.org/
|
13
|
+
rubyforge_project: rdig
|
14
|
+
description: "RDig provides an HTTP crawler and content extraction utilities to help building
|
15
|
+
a site search for web sites or intranets. Internally, Ferret is used for the
|
16
|
+
full text indexing. After creating a config file for your site, the index can
|
17
|
+
be built with a single call to rdig."
|
18
|
+
autorequire:
|
19
|
+
default_executable: rdig
|
20
|
+
bindir: bin
|
21
|
+
has_rdoc: true
|
22
|
+
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
23
|
+
requirements:
|
24
|
+
-
|
25
|
+
- ">"
|
26
|
+
- !ruby/object:Gem::Version
|
27
|
+
version: 0.0.0
|
28
|
+
version:
|
29
|
+
platform: ruby
|
30
|
+
authors:
|
31
|
+
- Jens Kraemer
|
32
|
+
files:
|
33
|
+
- bin/rdig
|
34
|
+
- lib/rdig
|
35
|
+
- lib/htmlentities
|
36
|
+
- lib/rdig.rb
|
37
|
+
- lib/rdig/http_client.rb
|
38
|
+
- lib/rdig/crawler.rb
|
39
|
+
- lib/rdig/search.rb
|
40
|
+
- lib/rdig/highlight.rb
|
41
|
+
- lib/rdig/index.rb
|
42
|
+
- lib/rdig/url_filters.rb
|
43
|
+
- lib/rdig/content_extractors.rb
|
44
|
+
- lib/htmlentities/CHANGES
|
45
|
+
- lib/htmlentities/COPYING
|
46
|
+
- lib/htmlentities/README
|
47
|
+
- lib/htmlentities/htmlentities.rb
|
48
|
+
- test/unit
|
49
|
+
- test/fixtures
|
50
|
+
- test/test_helper.rb
|
51
|
+
- test/unit/etag_filter_test.rb
|
52
|
+
- test/unit/url_filters_test.rb
|
53
|
+
- test/unit/html_content_extractor_test.rb
|
54
|
+
- test/fixtures/html
|
55
|
+
- test/fixtures/html/entities.html
|
56
|
+
- test/fixtures/html/simple.html
|
57
|
+
- test/fixtures/html/custom_tag_selectors.html
|
58
|
+
- doc/examples
|
59
|
+
- doc/examples/config.rb
|
60
|
+
- LICENSE
|
61
|
+
- TODO
|
62
|
+
- CHANGES
|
63
|
+
- README
|
64
|
+
- install.rb
|
65
|
+
- rakefile
|
66
|
+
test_files: []
|
67
|
+
rdoc_options:
|
68
|
+
- "--title"
|
69
|
+
- "Rake -- Ruby Make"
|
70
|
+
- "--main"
|
71
|
+
- README
|
72
|
+
- "--line-numbers"
|
73
|
+
extra_rdoc_files:
|
74
|
+
- README
|
75
|
+
- CHANGES
|
76
|
+
- LICENSE
|
77
|
+
- TODO
|
78
|
+
executables:
|
79
|
+
- rdig
|
80
|
+
extensions: []
|
81
|
+
requirements: []
|
82
|
+
dependencies:
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: ferret
|
85
|
+
version_requirement:
|
86
|
+
version_requirements: !ruby/object:Gem::Version::Requirement
|
87
|
+
requirements:
|
88
|
+
-
|
89
|
+
- ">="
|
90
|
+
- !ruby/object:Gem::Version
|
91
|
+
version: 0.3.2
|
92
|
+
version:
|
93
|
+
- !ruby/object:Gem::Dependency
|
94
|
+
name: rubyful_soup
|
95
|
+
version_requirement:
|
96
|
+
version_requirements: !ruby/object:Gem::Version::Requirement
|
97
|
+
requirements:
|
98
|
+
-
|
99
|
+
- ">="
|
100
|
+
- !ruby/object:Gem::Version
|
101
|
+
version: 1.0.4
|
102
|
+
version:
|