rdig 0.2.0 → 0.2.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README +0 -3
- data/lib/rdig.rb +1 -1
- data/lib/rdig/crawler.rb +4 -1
- data/rakefile +16 -3
- data/test/unit/html_content_extractor_test.rb +14 -0
- metadata +2 -2
data/README
CHANGED
@@ -52,9 +52,6 @@ from doc/examples/config.rb. The tag_selector properties are called
|
|
52
52
|
with a BeautifulSoup instance as parameter. See the RubyfulSoup Site[http://www.crummy.com/software/RubyfulSoup/documentation.html] for more info about this cool lib.
|
53
53
|
You can also have a look at the +html_content_extractor+ unit test.
|
54
54
|
|
55
|
-
See [] for API documentation of the
|
56
|
-
Rubyful Soup lib used
|
57
|
-
|
58
55
|
:include:doc/examples/config.rb
|
59
56
|
|
60
57
|
|
data/lib/rdig.rb
CHANGED
data/lib/rdig/crawler.rb
CHANGED
@@ -49,7 +49,10 @@ module RDig
|
|
49
49
|
def process_document(doc, filterchain)
|
50
50
|
doc.fetch
|
51
51
|
# add links from this document to the queue
|
52
|
-
doc.content[:links].each { |url|
|
52
|
+
doc.content[:links].each { |url|
|
53
|
+
add_url(url, filterchain, doc)
|
54
|
+
} unless doc.content[:links].nil?
|
55
|
+
|
53
56
|
return unless @etag_filter.apply(doc)
|
54
57
|
case doc.status
|
55
58
|
when :success
|
data/rakefile
CHANGED
@@ -39,8 +39,8 @@ PKG_FILE_NAME = "#{PKG_NAME}-#{PKG_VERSION}"
|
|
39
39
|
|
40
40
|
RELEASE_NAME = "REL #{PKG_VERSION}"
|
41
41
|
|
42
|
-
|
43
|
-
|
42
|
+
RUBYFORGE_PROJECT = "rdig"
|
43
|
+
RUBYFORGE_USER = "jkraemer"
|
44
44
|
|
45
45
|
PKG_FILES = FileList[
|
46
46
|
"bin/**/*",
|
@@ -323,8 +323,21 @@ task :tag => [:prerelease] do
|
|
323
323
|
end
|
324
324
|
end
|
325
325
|
|
326
|
+
# --------------------------------------------------------------------
|
327
|
+
# Upload release to rubyforge
|
328
|
+
desc "Upload release to rubyforge"
|
329
|
+
task :prel do
|
330
|
+
`rubyforge login`
|
331
|
+
#for ext in %w( gem tgz )
|
332
|
+
for ext in %w( gem )
|
333
|
+
release_command = "rubyforge add_release #{RUBYFORGE_PROJECT} #{PKG_NAME} '#{PKG_VERSION}' pkg/#{PKG_NAME}-#{PKG_VERSION}.#{ext}"
|
334
|
+
puts release_command
|
335
|
+
system(release_command)
|
336
|
+
end
|
337
|
+
end
|
338
|
+
|
326
339
|
# Publish RDocs ------------------------------------------------------
|
327
340
|
desc "Publish the API documentation"
|
328
341
|
task :pdoc => [:rdoc] do
|
329
|
-
Rake::RubyForgePublisher.new(
|
342
|
+
Rake::RubyForgePublisher.new(RUBYFORGE_PROJECT, RUBYFORGE_USER).upload
|
330
343
|
end
|
@@ -59,6 +59,7 @@ class HtmlContentExtractorTest < Test::Unit::TestCase
|
|
59
59
|
assert_equal '/footer.html', result[:links][2]
|
60
60
|
end
|
61
61
|
|
62
|
+
|
62
63
|
def test_title_from_dcmeta
|
63
64
|
RDig.configuration do |config|
|
64
65
|
config.content_extraction.html.title_tag_selector = lambda do |tagsoup|
|
@@ -69,5 +70,18 @@ class HtmlContentExtractorTest < Test::Unit::TestCase
|
|
69
70
|
assert_equal 'Title from DC meta data', result[:title]
|
70
71
|
end
|
71
72
|
|
73
|
+
def test_preprocessed_title
|
74
|
+
RDig.configuration do |config|
|
75
|
+
config.content_extraction.html.title_tag_selector = lambda do |tagsoup|
|
76
|
+
title = tagsoup.find('meta', :attrs => { 'name', 'DC.title' })['content']
|
77
|
+
# use only a portion of the title tag's contents if it matches our
|
78
|
+
# regexp:
|
79
|
+
title =~ /^(.*)meta data$/ ? $1.strip : title.strip
|
80
|
+
end
|
81
|
+
end
|
82
|
+
result = @extractor.process(html_doc('custom_tag_selectors'))
|
83
|
+
assert_equal 'Title from DC', result[:title]
|
84
|
+
end
|
85
|
+
|
72
86
|
end
|
73
87
|
|
metadata
CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.8.11
|
|
3
3
|
specification_version: 1
|
4
4
|
name: rdig
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.2.0
|
7
|
-
date: 2006-04-
|
6
|
+
version: 0.2.1
|
7
|
+
date: 2006-04-20 00:00:00 +02:00
|
8
8
|
summary: Ruby based web site indexing and searching library.
|
9
9
|
require_paths:
|
10
10
|
- lib
|