rdig 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGES +2 -0
- data/LICENSE +20 -0
- data/README +61 -0
- data/TODO +0 -0
- data/bin/rdig +32 -0
- data/doc/examples/config.rb +53 -0
- data/install.rb +89 -0
- data/lib/htmlentities/CHANGES +21 -0
- data/lib/htmlentities/COPYING +7 -0
- data/lib/htmlentities/README +15 -0
- data/lib/htmlentities/htmlentities.rb +281 -0
- data/lib/rdig.rb +243 -0
- data/lib/rdig/content_extractors.rb +145 -0
- data/lib/rdig/crawler.rb +176 -0
- data/lib/rdig/highlight.rb +24 -0
- data/lib/rdig/http_client.rb +22 -0
- data/lib/rdig/index.rb +39 -0
- data/lib/rdig/search.rb +77 -0
- data/lib/rdig/url_filters.rb +171 -0
- data/rakefile +325 -0
- data/test/fixtures/html/custom_tag_selectors.html +25 -0
- data/test/fixtures/html/entities.html +15 -0
- data/test/fixtures/html/simple.html +17 -0
- data/test/test_helper.rb +18 -0
- data/test/unit/etag_filter_test.rb +23 -0
- data/test/unit/html_content_extractor_test.rb +64 -0
- data/test/unit/url_filters_test.rb +96 -0
- metadata +102 -0
@@ -0,0 +1,25 @@
|
|
1
|
+
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
|
2
|
+
<html>
|
3
|
+
<head>
|
4
|
+
<meta http-equiv="content-type" content="text/html;charset=iso-8859-1">
|
5
|
+
<meta name="DC.title" content="Title from DC meta data" />
|
6
|
+
<title>Sample Title</title>
|
7
|
+
</head>
|
8
|
+
<body>
|
9
|
+
<h1 class="title">Sample Title in h1</h1>
|
10
|
+
<a href="http://test.host/outside.html">Affe</a>
|
11
|
+
<h1>Some sample <span>text</span></h1>
|
12
|
+
<div id="content">
|
13
|
+
<a href="/inside.html">Affe</a>
|
14
|
+
<!-- some comment -->
|
15
|
+
<p>
|
16
|
+
Real content is here.</p>
|
17
|
+
</div>
|
18
|
+
<!-- another comment
|
19
|
+
here -->
|
20
|
+
<p>Some footer</p>
|
21
|
+
<a href="#foo">Top</a>
|
22
|
+
<a href="/footer.html">Affe</a>
|
23
|
+
</body>
|
24
|
+
</html>
|
25
|
+
|
@@ -0,0 +1,15 @@
|
|
1
|
+
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
|
2
|
+
<html>
|
3
|
+
<head>
|
4
|
+
<meta http-equiv="content-type" content="text/html;charset=iso-8859-1">
|
5
|
+
<title>Sample & Title</title>
|
6
|
+
</head>
|
7
|
+
<body>
|
8
|
+
<h1>Some > Links</h1>
|
9
|
+
<p>don't break me!</p>
|
10
|
+
<a href="http://test.host/affe.html?b=a&c=d">Affe</a>
|
11
|
+
<a href="http://test.host/affe2.html?b=a&c=d">Affe</a>
|
12
|
+
<h1>Ümläuts</h1>
|
13
|
+
<p>heiß hier ß</p>
|
14
|
+
</body>
|
15
|
+
</html>
|
@@ -0,0 +1,17 @@
|
|
1
|
+
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
|
2
|
+
<html>
|
3
|
+
<head>
|
4
|
+
<meta http-equiv="content-type" content="text/html;charset=iso-8859-1">
|
5
|
+
<title>Sample Title</title>
|
6
|
+
</head>
|
7
|
+
<body>
|
8
|
+
<h1>A Link</h1>
|
9
|
+
<a href="http://test.host/affe.html">Affe</a>
|
10
|
+
<h1>Some sample <span>text</span></h1>
|
11
|
+
<!-- invalid markup follows -->
|
12
|
+
<p>Lorem<br>
|
13
|
+
<!-- another comment
|
14
|
+
here -->
|
15
|
+
ipsum
|
16
|
+
</body>
|
17
|
+
</html>
|
data/test/test_helper.rb
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
require 'test/unit'
|
2
|
+
require 'rdig'
|
3
|
+
#File.expand_path(File.dirname(__FILE__) + "/../init.rb")
|
4
|
+
# require File.expand_path(File.dirname(__FILE__) + "/../init.rb")
|
5
|
+
|
6
|
+
# Helpers mixed into the unit tests for loading fixture files that live in
# the "fixtures" directory next to this file.
module TestHelper
  include RDig

  # Returns the complete contents of the fixture file at +path+, where
  # +path+ is relative to the "fixtures" directory beside this helper.
  def read_fixture(path)
    helper_dir = File.expand_path(File.dirname(__FILE__))
    File.read("#{helper_dir}/fixtures/#{path}")
  end

  # Returns the contents of the HTML fixture "html/<name>.html".
  def html_doc(name)
    read_fixture("html/#{name}.html")
  end
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'test_helper'
|
2
|
+
# OpenStruct is used below to stand in for documents; require it explicitly
# instead of relying on another file having loaded it already.
require 'ostruct'

# Unit test for ETagFilter#apply, driven with stand-in documents that only
# respond to +etag+.
class ETagFilterTest < Test::Unit::TestCase
  include TestHelper, RDig

  # Every test starts with a fresh filter.
  def setup
    @filter = ETagFilter.new
  end

  # Expects apply to return a truthy value for a document without an etag
  # and for the first document carrying a given etag, and a falsy value when
  # an etag that was already applied is presented again.
  def test_add
    d0 = OpenStruct.new(:etag => nil)
    assert @filter.apply(d0)

    d1 = OpenStruct.new(:etag => 'abc1234')
    assert @filter.apply(d1)
    assert !@filter.apply(d1)

    d2 = OpenStruct.new(:etag => 'abc1235')
    assert @filter.apply(d2)
    assert !@filter.apply(d2)
    # the first etag must still be refused after a second one was added
    assert !@filter.apply(d1)
  end

end
|
@@ -0,0 +1,64 @@
|
|
1
|
+
require 'test_helper'
|
2
|
+
# Unit tests for ContentExtractors::HtmlContentExtractor, run against the
# HTML fixtures loaded through TestHelper#html_doc.
class HtmlContentExtractorTest < Test::Unit::TestCase
  include TestHelper

  # Remembers the extractor under test, a non breaking space character for
  # the expected strings, and a copy of the HTML extraction config so that
  # tests may install their own selectors.
  def setup
    @extractor = ContentExtractors::HtmlContentExtractor
    @nbsp = [160].pack('U') # non breaking space
    @config_backup = RDig.config.content_extraction.html.clone
  end

  # Puts back the HTML extraction config copied in setup.
  def teardown
    RDig.config.content_extraction.html = @config_backup
  end

  # Title, text content and the single link of the simple fixture.
  def test_simple
    result = @extractor.process(html_doc('simple'))
    assert_not_nil result
    assert_equal 'Sample Title', result[:title]
    assert_not_nil result[:content]
    assert_not_nil result[:links]
    assert_equal 1, result[:links].size
    assert_equal 'A Link Affe Some sample text Lorem ipsum', result[:content]
    assert_equal 'http://test.host/affe.html', result[:links].first
  end

  # Title, links and content of the entities fixture.
  def test_entities
    result = @extractor.process(html_doc('entities'))
    assert_equal 'Sample & Title', result[:title]
    assert_equal 'http://test.host/affe.html?b=a&c=d', result[:links].first
    assert_equal 'http://test.host/affe2.html?b=a&c=d', result[:links].last
    assert_equal "Some > Links don't#{@nbsp}break me! Affe Affe Ümläuts heiß hier ß", result[:content]
  end

  # Installs custom title and content selectors and checks the extracted
  # title, content and links.
  def test_custom_content_element
    RDig.configuration do |config|
      # Hashes are written with explicit => pairs: the comma separated form
      # { 'class', 'title' } is a syntax error from Ruby 1.9 on.
      config.content_extraction.html.title_tag_selector = lambda do |tagsoup|
        tagsoup.find('h1', :attrs => { 'class' => 'title' })
      end
      config.content_extraction.html.content_tag_selector = lambda do |tagsoup|
        tagsoup.find('div', :attrs => { 'id' => 'content' })
      end
    end
    result = @extractor.process(html_doc('custom_tag_selectors'))
    assert_equal 'Sample Title in h1', result[:title]
    assert_equal 'Affe Real content is here.', result[:content]
    # check if links are collected outside the content tag, too:
    assert_equal 3, result[:links].size
    assert_equal 'http://test.host/outside.html', result[:links].first
    assert_equal '/inside.html', result[:links][1]
    assert_equal '/footer.html', result[:links][2]
  end

  # Installs a title selector that returns the content attribute of the
  # DC.title meta tag and checks the extracted title.
  def test_title_from_dcmeta
    RDig.configuration do |config|
      config.content_extraction.html.title_tag_selector = lambda do |tagsoup|
        tagsoup.find('meta', :attrs => { 'name' => 'DC.title' })['content']
      end
    end
    result = @extractor.process(html_doc('custom_tag_selectors'))
    assert_equal 'Title from DC meta data', result[:title]
  end

end
|
64
|
+
|
@@ -0,0 +1,96 @@
|
|
1
|
+
require 'test_helper'
|
2
|
+
# Unit tests for the filters and filter chains defined in UrlFilters.
class UrlFilterTest < Test::Unit::TestCase
  include TestHelper, RDig

  # Nothing to prepare for these tests.
  def setup
  end

  # A chain whose filters get their arguments passed in directly.
  def test_filterchain
    chain = UrlFilters::FilterChain.new(
      [
        { UrlFilters::UrlInclusionFilter => /.+html$/ },
        { :hostname_filter => 'test.host' }
      ]
    )

    assert_nil_for chain, "http://test.host/affe.htm"
    assert_not_nil_for chain, "http://test.host/affe.html"
    assert_nil_for chain, "http://test.host.com/affe.html"
  end

  # A chain built from RDig.filter_chain.
  def test_default_filterchain
    default_chain = UrlFilters::FilterChain.new(RDig.filter_chain)
    assert_nil_for default_chain, "http://www.example.com/affe.htm"
    assert_not_nil_for default_chain, "http://localhost:3000/affe.html"
    assert_nil_for default_chain, "http://localhost.com/affe.html"
  end

  # A chain whose filter arguments are given as symbols naming crawler
  # config settings instead of direct values.
  def test_filterchain_config
    RDig.configuration do |conf|
      conf.crawler.include_patterns = /.+html$/
      conf.crawler.include_hosts = 'test.host'
    end
    chain = UrlFilters::FilterChain.new(
      [
        { UrlFilters::UrlInclusionFilter => :include_patterns },
        { :hostname_filter => :include_hosts }
      ]
    )

    assert_nil_for chain, "http://test.host/affe.htm"
    assert_not_nil_for chain, "http://test.host/affe.html"
    assert_nil_for chain, "http://test.host.com/affe.html"
  end

  # Inclusion and exclusion filters built from regular expressions.
  def test_urlpattern_filter
    inclusion = UrlFilters::UrlInclusionFilter.new(/.*\.html$/)
    assert_nil_for inclusion, "http://test.host/affe.htm"
    assert_not_nil_for inclusion, "http://test.host/affe.html"

    by_suffix = UrlFilters::UrlExclusionFilter.new([ /.*\.html$/, /.*\.aspx/ ])
    assert_not_nil_for by_suffix, "http://test.host/affe.htm"
    assert_nil_for by_suffix, "http://test.host/affe.html"
    assert_nil_for by_suffix, "http://test.host/affe.aspx"

    # pattern tied to the first path segment after the host
    top_level_dir = UrlFilters::UrlExclusionFilter.new([ /http:\/\/[^\/]+\/dir1/ ])
    assert_nil_for top_level_dir, "http://test.host/dir1/affe.aspx"
    assert_not_nil_for top_level_dir, "http://test.host/dir2/dir1/affe.htm"
    assert_not_nil_for top_level_dir, "http://test.host/affe.htm"
    assert_not_nil_for top_level_dir, "http://test.host/dir2/affe.htm"

    # pattern that may match at any depth of the path
    any_level_dir = UrlFilters::UrlExclusionFilter.new([ /\/dir1/ ])
    assert_nil_for any_level_dir, "http://test.host/dir1/affe.aspx"
    assert_nil_for any_level_dir, "http://test.host/dir2/dir1/affe.htm"
    assert_not_nil_for any_level_dir, "http://test.host/affe.htm"
    assert_not_nil_for any_level_dir, "http://test.host/dir2/affe.htm"
  end

  # UrlFilters.hostname_filter called directly with a list of hosts.
  def test_hostname_filter
    include_hosts = [ 'test.host', 'localhost' ]
    google = UrlFilters.hostname_filter(Document.new('http://google.com/'), include_hosts)
    assert_nil google
    test_host = UrlFilters.hostname_filter(Document.new('http://test.host/file.html'), include_hosts)
    assert_not_nil test_host
    localhost = UrlFilters.hostname_filter(Document.new('http://localhost/file.html'), include_hosts)
    assert_not_nil localhost
  end

  # UrlFilters.fix_relative_uri for links found in documents at several
  # base URLs. Each entry pairs a base URL with [link, expected uri] rows.
  def test_fix_relative_uri
    expectations = [
      ['http://test.host/dir/file.html', [
        ['another.html', 'http://test.host/dir/another.html'],
        ['../another.html', 'http://test.host/dir/../another.html'],
        ['/dir/another.html', 'http://test.host/dir/another.html'],
        ['http://test.host/dir/another.html', 'http://test.host/dir/another.html'],
        ['HTTP://test.host/dir/another.html', 'HTTP://test.host/dir/another.html']
      ]],
      ['https://test.host/dir/', [
        ['another.html', 'https://test.host/dir/another.html']
      ]],
      ['https://test.host/', [
        ['another.html', 'https://test.host/another.html']
      ]],
      ['https://test.host', [
        ['another.html', 'https://test.host/another.html']
      ]]
    ]

    expectations.each do |base_url, rows|
      doc = Document.new(base_url)
      rows.each do |link, expected|
        assert_equal(expected,
                     UrlFilters.fix_relative_uri(Document.new(link, doc.uri)).uri.to_s)
      end
    end
  end

  private

  # Asserts that +filter+.apply returns nil for a document created from +url+.
  def assert_nil_for(filter, url)
    assert_nil filter.apply(Document.new(url))
  end

  # Asserts that +filter+.apply returns a non-nil value for a document
  # created from +url+.
  def assert_not_nil_for(filter, url)
    assert_not_nil filter.apply(Document.new(url))
  end
end
|
96
|
+
|
metadata
ADDED
@@ -0,0 +1,102 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
rubygems_version: 0.8.10
|
3
|
+
specification_version: 1
|
4
|
+
name: rdig
|
5
|
+
version: !ruby/object:Gem::Version
|
6
|
+
version: 0.1.0
|
7
|
+
date: 2006-03-25
|
8
|
+
summary: Ruby based web site indexing and searching library.
|
9
|
+
require_paths:
|
10
|
+
- lib
|
11
|
+
email: jk@jkraemer.net
|
12
|
+
homepage: http://rdig.rubyforge.org/
|
13
|
+
rubyforge_project: rdig
|
14
|
+
description: "RDig provides an HTTP crawler and content extraction utilities to help build
|
15
|
+
a site search for web sites or intranets. Internally, Ferret is used for the
|
16
|
+
full text indexing. After creating a config file for your site, the index can
|
17
|
+
be built with a single call to rdig."
|
18
|
+
autorequire:
|
19
|
+
default_executable: rdig
|
20
|
+
bindir: bin
|
21
|
+
has_rdoc: true
|
22
|
+
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
23
|
+
requirements:
|
24
|
+
-
|
25
|
+
- ">"
|
26
|
+
- !ruby/object:Gem::Version
|
27
|
+
version: 0.0.0
|
28
|
+
version:
|
29
|
+
platform: ruby
|
30
|
+
authors:
|
31
|
+
- Jens Kraemer
|
32
|
+
files:
|
33
|
+
- bin/rdig
|
34
|
+
- lib/rdig
|
35
|
+
- lib/htmlentities
|
36
|
+
- lib/rdig.rb
|
37
|
+
- lib/rdig/http_client.rb
|
38
|
+
- lib/rdig/crawler.rb
|
39
|
+
- lib/rdig/search.rb
|
40
|
+
- lib/rdig/highlight.rb
|
41
|
+
- lib/rdig/index.rb
|
42
|
+
- lib/rdig/url_filters.rb
|
43
|
+
- lib/rdig/content_extractors.rb
|
44
|
+
- lib/htmlentities/CHANGES
|
45
|
+
- lib/htmlentities/COPYING
|
46
|
+
- lib/htmlentities/README
|
47
|
+
- lib/htmlentities/htmlentities.rb
|
48
|
+
- test/unit
|
49
|
+
- test/fixtures
|
50
|
+
- test/test_helper.rb
|
51
|
+
- test/unit/etag_filter_test.rb
|
52
|
+
- test/unit/url_filters_test.rb
|
53
|
+
- test/unit/html_content_extractor_test.rb
|
54
|
+
- test/fixtures/html
|
55
|
+
- test/fixtures/html/entities.html
|
56
|
+
- test/fixtures/html/simple.html
|
57
|
+
- test/fixtures/html/custom_tag_selectors.html
|
58
|
+
- doc/examples
|
59
|
+
- doc/examples/config.rb
|
60
|
+
- LICENSE
|
61
|
+
- TODO
|
62
|
+
- CHANGES
|
63
|
+
- README
|
64
|
+
- install.rb
|
65
|
+
- rakefile
|
66
|
+
test_files: []
|
67
|
+
rdoc_options:
|
68
|
+
- "--title"
|
69
|
+
- "RDig -- Ruby based web site indexing and searching library"
|
70
|
+
- "--main"
|
71
|
+
- README
|
72
|
+
- "--line-numbers"
|
73
|
+
extra_rdoc_files:
|
74
|
+
- README
|
75
|
+
- CHANGES
|
76
|
+
- LICENSE
|
77
|
+
- TODO
|
78
|
+
executables:
|
79
|
+
- rdig
|
80
|
+
extensions: []
|
81
|
+
requirements: []
|
82
|
+
dependencies:
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: ferret
|
85
|
+
version_requirement:
|
86
|
+
version_requirements: !ruby/object:Gem::Version::Requirement
|
87
|
+
requirements:
|
88
|
+
-
|
89
|
+
- ">="
|
90
|
+
- !ruby/object:Gem::Version
|
91
|
+
version: 0.3.2
|
92
|
+
version:
|
93
|
+
- !ruby/object:Gem::Dependency
|
94
|
+
name: rubyful_soup
|
95
|
+
version_requirement:
|
96
|
+
version_requirements: !ruby/object:Gem::Version::Requirement
|
97
|
+
requirements:
|
98
|
+
-
|
99
|
+
- ">="
|
100
|
+
- !ruby/object:Gem::Version
|
101
|
+
version: 1.0.4
|
102
|
+
version:
|