rdig 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +0 -3
- data/lib/rdig.rb +1 -1
- data/lib/rdig/crawler.rb +4 -1
- data/rakefile +16 -3
- data/test/unit/html_content_extractor_test.rb +14 -0
- metadata +2 -2
data/README
CHANGED
@@ -52,9 +52,6 @@ from doc/examples/config.rb. The tag_selector properties are called
|
|
52
52
|
with a BeautifulSoup instance as parameter. See the RubyfulSoup Site[http://www.crummy.com/software/RubyfulSoup/documentation.html] for more info about this cool lib.
|
53
53
|
You can also have a look at the +html_content_extractor+ unit test.
|
54
54
|
|
55
|
-
See [] for API documentation of the
|
56
|
-
Rubyful Soup lib used
|
57
|
-
|
58
55
|
:include:doc/examples/config.rb
|
59
56
|
|
60
57
|
|
data/lib/rdig.rb
CHANGED
data/lib/rdig/crawler.rb
CHANGED
@@ -49,7 +49,10 @@ module RDig
|
|
49
49
|
def process_document(doc, filterchain)
|
50
50
|
doc.fetch
|
51
51
|
# add links from this document to the queue
|
52
|
-
doc.content[:links].each { |url|
|
52
|
+
doc.content[:links].each { |url|
|
53
|
+
add_url(url, filterchain, doc)
|
54
|
+
} unless doc.content[:links].nil?
|
55
|
+
|
53
56
|
return unless @etag_filter.apply(doc)
|
54
57
|
case doc.status
|
55
58
|
when :success
|
data/rakefile
CHANGED
@@ -39,8 +39,8 @@ PKG_FILE_NAME = "#{PKG_NAME}-#{PKG_VERSION}"
|
|
39
39
|
|
40
40
|
RELEASE_NAME = "REL #{PKG_VERSION}"
|
41
41
|
|
42
|
-
|
43
|
-
|
42
|
+
RUBYFORGE_PROJECT = "rdig"
|
43
|
+
RUBYFORGE_USER = "jkraemer"
|
44
44
|
|
45
45
|
PKG_FILES = FileList[
|
46
46
|
"bin/**/*",
|
@@ -323,8 +323,21 @@ task :tag => [:prerelease] do
|
|
323
323
|
end
|
324
324
|
end
|
325
325
|
|
326
|
+
# --------------------------------------------------------------------
|
327
|
+
# Upload release to rubyforge
|
328
|
+
desc "Upload release to rubyforge"
|
329
|
+
task :prel do
|
330
|
+
`rubyforge login`
|
331
|
+
#for ext in %w( gem tgz )
|
332
|
+
for ext in %w( gem )
|
333
|
+
release_command = "rubyforge add_release #{RUBYFORGE_PROJECT} #{PKG_NAME} '#{PKG_VERSION}' pkg/#{PKG_NAME}-#{PKG_VERSION}.#{ext}"
|
334
|
+
puts release_command
|
335
|
+
system(release_command)
|
336
|
+
end
|
337
|
+
end
|
338
|
+
|
326
339
|
# Publish RDocs ------------------------------------------------------
|
327
340
|
desc "Publish the API documentation"
|
328
341
|
task :pdoc => [:rdoc] do
|
329
|
-
Rake::RubyForgePublisher.new(
|
342
|
+
Rake::RubyForgePublisher.new(RUBYFORGE_PROJECT, RUBYFORGE_USER).upload
|
330
343
|
end
|
@@ -59,6 +59,7 @@ class HtmlContentExtractorTest < Test::Unit::TestCase
|
|
59
59
|
assert_equal '/footer.html', result[:links][2]
|
60
60
|
end
|
61
61
|
|
62
|
+
|
62
63
|
def test_title_from_dcmeta
|
63
64
|
RDig.configuration do |config|
|
64
65
|
config.content_extraction.html.title_tag_selector = lambda do |tagsoup|
|
@@ -69,5 +70,18 @@ class HtmlContentExtractorTest < Test::Unit::TestCase
|
|
69
70
|
assert_equal 'Title from DC meta data', result[:title]
|
70
71
|
end
|
71
72
|
|
73
|
+
def test_preprocessed_title
|
74
|
+
RDig.configuration do |config|
|
75
|
+
config.content_extraction.html.title_tag_selector = lambda do |tagsoup|
|
76
|
+
title = tagsoup.find('meta', :attrs => { 'name', 'DC.title' })['content']
|
77
|
+
# use only a portion of the title tag's contents if it matches our
|
78
|
+
# regexp:
|
79
|
+
title =~ /^(.*)meta data$/ ? $1.strip : title.strip
|
80
|
+
end
|
81
|
+
end
|
82
|
+
result = @extractor.process(html_doc('custom_tag_selectors'))
|
83
|
+
assert_equal 'Title from DC', result[:title]
|
84
|
+
end
|
85
|
+
|
72
86
|
end
|
73
87
|
|
metadata
CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.8.11
|
|
3
3
|
specification_version: 1
|
4
4
|
name: rdig
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.2.0
|
7
|
-
date: 2006-04-
|
6
|
+
version: 0.2.1
|
7
|
+
date: 2006-04-20 00:00:00 +02:00
|
8
8
|
summary: Ruby based web site indexing and searching library.
|
9
9
|
require_paths:
|
10
10
|
- lib
|