rdig 0.3.5 → 0.3.8

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGES CHANGED
@@ -1,8 +1,13 @@
1
+ 0.3.6
2
+ - remove bundled htmlentities in favor of a gem dependency
3
+ - also extract links from area and frame tags
4
+ - fix etagfilter bug
5
+
1
6
  0.3.5
2
7
  - Add max_depth option to crawler configuration for limiting the crawl to a
3
8
  specific depth
4
9
  - add support for http proxies including basic authentication
5
- - remove rubyfoul_soup support
10
+ - remove rubyful_soup support
6
11
 
7
12
  0.3.4
8
13
 
@@ -0,0 +1,11 @@
1
+ == 0.3.8 2009-04-26
2
+
3
+ * bump up version
4
+
5
+ == 0.3.7 2009-04-26
6
+
7
+ * Gem spec for automatic gem building on github
8
+ * doc enhancements
9
+ * better uri-normalization, re-add result uri of redirection
10
+ into the queue instead of directly indexing the resulting
11
+ page
@@ -0,0 +1,39 @@
1
+ CHANGES
2
+ History.txt
3
+ install.rb
4
+ LICENSE
5
+ Manifest.txt
6
+ rakefile
7
+ README
8
+ bin/rdig
9
+ doc/examples/config.rb
10
+ lib/rdig/content_extractors/doc.rb
11
+ lib/rdig/content_extractors/hpricot.rb
12
+ lib/rdig/content_extractors/pdf.rb
13
+ lib/rdig/content_extractors.rb
14
+ lib/rdig/crawler.rb
15
+ lib/rdig/documents.rb
16
+ lib/rdig/file.rb
17
+ lib/rdig/highlight.rb
18
+ lib/rdig/index.rb
19
+ lib/rdig/search.rb
20
+ lib/rdig/url_filters.rb
21
+ lib/rdig.rb
22
+ test/fixtures/html/custom_tag_selectors.html
23
+ test/fixtures/html/entities.html
24
+ test/fixtures/html/frameset.html
25
+ test/fixtures/html/imagemap.html
26
+ test/fixtures/html/simple.html
27
+ test/fixtures/pdf/simple.pdf
28
+ test/fixtures/word/simple.doc
29
+ test/test_helper.rb
30
+ test/unit/crawler_fs_test.rb
31
+ test/unit/etag_filter_test.rb
32
+ test/unit/file_document_test.rb
33
+ test/unit/hpricot_content_extractor_test.rb
34
+ test/unit/http_document_test.rb
35
+ test/unit/pdf_content_extractor_test.rb
36
+ test/unit/rdig_test.rb
37
+ test/unit/searcher_test.rb
38
+ test/unit/url_filters_test.rb
39
+ test/unit/word_content_extractor_test.rb
@@ -86,6 +86,13 @@ RDig.configuration do |cfg|
86
86
  # Set to 0 to only index the start_urls.
87
87
  # cfg.crawler.max_depth = nil
88
88
 
89
+ # default index document to be appended to URIs ending with a trailing '/'
90
+ # cfg.crawler.normalize_uri.index_document = nil
91
+ # strip trailing '/' from URIs to avoid double indexing of pages referred to both with and without a trailing '/'.
92
+ # Ignored if index_document is set.
93
+ # Not necessary when the server issues proper etags since the default etag filter will kill these duplicates.
94
+ # cfg.crawler.normalize_uri.remove_trailing_slash = nil
95
+
89
96
  # http proxy configuration
90
97
  # proxy url
91
98
  # cfg.crawler.http_proxy = nil
@@ -94,6 +101,9 @@ RDig.configuration do |cfg|
94
101
  # cfg.crawler.http_proxy_user = nil
95
102
  # proxy password
96
103
  # cfg.crawler.http_proxy_pass = nil
104
+ #
105
+ # to use basic auth without a proxy, use this syntax:
106
+ # cfg.crawler.open_uri_http_options = { :http_basic_authentication => [user, password] }
97
107
 
98
108
  # indexer options
99
109
 
@@ -24,7 +24,7 @@
24
24
  #++
25
25
  #
26
26
 
27
- RDIGVERSION = '0.3.5'
27
+ RDIGVERSION = '0.3.8'
28
28
 
29
29
 
30
30
  require 'thread'
@@ -49,7 +49,8 @@ rescue LoadError
49
49
  require 'ferret'
50
50
  end
51
51
 
52
- require 'htmlentities/htmlentities'
52
+
53
+ #require 'htmlentities/htmlentities'
53
54
 
54
55
 
55
56
  $KCODE = 'u'
@@ -60,17 +61,16 @@ module RDig
60
61
 
61
62
  class << self
62
63
 
63
- # the filter chains are for limiting the set of indexed documents.
64
- # there are two chain types - one for http, and one for file system
65
- # crawling.
66
- # a document has to survive all filters in the chain to get indexed.
64
+ # Filter chains are used by the crawler to limit the set of documents being indexed.
65
+ # There are two chains - one for http, and one for file system crawling.
66
+ # Each document has to survive all filters in the relevant chain to get indexed.
67
67
  def filter_chain
68
68
  @filter_chain ||= {
69
69
  # filter chain for http crawling
70
70
  :http => [
71
71
  :scheme_filter_http,
72
72
  :fix_relative_uri,
73
- :normalize_uri,
73
+ { :normalize_uri => :normalize_uri },
74
74
  { RDig::UrlFilters::DepthFilter => :max_depth },
75
75
  { :hostname_filter => :include_hosts },
76
76
  { RDig::UrlFilters::UrlInclusionFilter => :include_documents },
@@ -120,7 +120,11 @@ module RDig
120
120
  :wait_before_leave => 10,
121
121
  :http_proxy => nil,
122
122
  :http_proxy_user => nil,
123
- :http_proxy_pass => nil
123
+ :http_proxy_pass => nil,
124
+ :normalize_uri => OpenStruct.new(
125
+ :index_document => nil,
126
+ :remove_trailing_slash => nil
127
+ )
124
128
  ),
125
129
  :content_extraction => OpenStruct.new(
126
130
  # settings for html content extraction (hpricot)
@@ -23,7 +23,13 @@ module RDig
23
23
  def self.extractor_instances
24
24
  @@extractor_instances ||= extractors.map { |ex_class|
25
25
  RDig.logger.info "initializing content extractor: #{ex_class}"
26
- ex_class.new(RDig.configuration.content_extraction) rescue nil
26
+ ex = nil
27
+ begin
28
+ ex = ex_class.new(RDig.configuration.content_extraction)
29
+ rescue Exception
30
+ RDig.logger.error "error: #{$!.message}\n#{$!.backtrace.join("\n")}"
31
+ end
32
+ ex
27
33
  }.compact
28
34
  end
29
35
 
@@ -13,25 +13,11 @@ module RDig
13
13
  @wvhtml = 'wvHtml'
14
14
  @pattern = /^application\/msword/
15
15
  # html extractor for parsing wvHtml output
16
- if defined?(HpricotContentExtractor)
17
- @html_extractor = HpricotContentExtractor.new(OpenStruct.new(
18
- :hpricot => OpenStruct.new(
19
- :content_tag_selector => 'body',
20
- :title_tag_selector => 'title'
21
- )))
22
- elsif defined?(RubyfulSoupContentExtractor)
23
- @html_extractor = RubyfulSoupContentExtractor.new(OpenStruct.new(
24
- :rubyful_soup => OpenStruct.new(
25
- :content_tag_selector => lambda { |tagsoup|
26
- tagsoup.html.body
27
- },
28
- :title_tag_selector => lambda { |tagsoup|
29
- tagsoup.html.head.title
30
- }
31
- )))
32
- else
33
- raise "need at least one html content extractor - please install hpricot or rubyful_soup"
34
- end
16
+ @html_extractor = HpricotContentExtractor.new(OpenStruct.new(
17
+ :hpricot => OpenStruct.new(
18
+ :content_tag_selector => 'body',
19
+ :title_tag_selector => 'title'
20
+ )))
35
21
  # TODO: better: if $?.exitstatus == 127 (not found)
36
22
  @available = %x{#{@wvhtml} -h 2>&1} =~ /Dom Lachowicz/
37
23
  end
@@ -1,11 +1,12 @@
1
1
  begin
2
2
  require 'hpricot'
3
+ require 'htmlentities'
3
4
  rescue LoadError
4
5
  require 'rubygems'
5
6
  require 'hpricot'
7
+ require 'htmlentities'
6
8
  end
7
9
 
8
- if defined?(Hpricot)
9
10
  module RDig
10
11
  module ContentExtractors
11
12
 
@@ -23,11 +24,12 @@ module RDig
23
24
  # :title => 'Title',
24
25
  # :links => [array of urls] }
25
26
  def process(content)
27
+ entities = HTMLEntities.new
26
28
  doc = Hpricot(content)
27
29
  {
28
- :title => extract_title(doc).decode_entities.strip,
30
+ :title => entities.decode(extract_title(doc)).strip,
29
31
  :links => extract_links(doc),
30
- :content => extract_content(doc).decode_entities
32
+ :content => entities.decode(extract_content(doc))
31
33
  }
32
34
  end
33
35
 
@@ -50,12 +52,14 @@ module RDig
50
52
  # extracts the href attributes of all a tags, except
51
53
  # internal links like <a href="#top">
52
54
  def extract_links(doc)
53
- (doc/'a').map { |link|
54
- href = link['href']
55
- CGI.unescapeHTML(href) if href && href !~ /^#/
56
- }.compact
55
+ {'a' => 'href', 'area' => 'href', 'frame' => 'src'}.map do |tag, attr|
56
+ (doc/tag).map do |tag|
57
+ value = tag[attr]
58
+ CGI.unescapeHTML(value) if value && value !~ /^#/
59
+ end
60
+ end.flatten.compact
57
61
  end
58
-
62
+
59
63
  # Extracts the title from the given html tree
60
64
  def extract_title(doc)
61
65
  the_title_tag = title_tag(doc)
@@ -85,6 +89,7 @@ module RDig
85
89
  def strip_comments(string)
86
90
  string.gsub Regexp.new('<!--.*?-->', Regexp::MULTILINE, 'u'), ''
87
91
  end
92
+
88
93
  def strip_tags(string)
89
94
  string.gsub! Regexp.new('<(script|style).*?>.*?<\/(script|style).*?>',
90
95
  Regexp::MULTILINE, 'u'), ''
@@ -98,4 +103,3 @@ module RDig
98
103
 
99
104
  end
100
105
  end
101
- end
@@ -5,7 +5,6 @@ module RDig
5
5
 
6
6
  def initialize(config = RDig.config, logger = RDig.logger)
7
7
  @documents = Queue.new
8
- @etag_filter = ETagFilter.new
9
8
  @logger = logger
10
9
  @config = config
11
10
  end
@@ -22,7 +21,8 @@ module RDig
22
21
  # check whether we are indexing on-disk or via http
23
22
  url_type = @config.crawler.start_urls.first =~ /^file:\/\// ? :file : :http
24
23
  chain_config = RDig.filter_chain[url_type]
25
-
24
+
25
+ @etag_filter = ETagFilter.new
26
26
  filterchain = UrlFilters::FilterChain.new(chain_config)
27
27
  @config.crawler.start_urls.each { |url| add_url(url, filterchain) }
28
28
 
@@ -52,13 +52,21 @@ module RDig
52
52
  def process_document(doc, filterchain)
53
53
  @logger.debug "processing document #{doc}"
54
54
  doc.fetch
55
- # add links from this document to the queue
56
- doc.content[:links].each { |url|
57
- add_url(url, filterchain, doc)
58
- } unless doc.content[:links].nil?
59
-
60
- return unless @etag_filter.apply(doc)
61
- add_to_index doc
55
+ case doc.status
56
+ when :success
57
+ if @etag_filter.apply(doc)
58
+ # add links from this document to the queue
59
+ doc.content[:links].each { |url|
60
+ add_url(url, filterchain, doc)
61
+ } unless doc.content[:links].nil?
62
+ add_to_index doc
63
+ end
64
+ when :redirect
65
+ @logger.debug "redirect to #{doc.content}"
66
+ add_url(doc.content, filterchain, doc)
67
+ else
68
+ @logger.error "unknown doc status #{doc.status}: #{doc}"
69
+ end
62
70
  rescue
63
71
  @logger.error "error processing document #{doc.uri.to_s}: #{$!}"
64
72
  @logger.debug "Trace: #{$!.backtrace.join("\n")}"
@@ -110,7 +118,7 @@ module RDig
110
118
  end
111
119
 
112
120
  def apply(document)
113
- return document unless (document.respond_to?(:etag) && document.etag)
121
+ return document unless (document.respond_to?(:etag) && document.etag && !document.etag.empty?)
114
122
  synchronize do
115
123
  @etags.add?(document.etag) ? document : nil
116
124
  end
@@ -118,16 +118,20 @@ module RDig
118
118
  def fetch
119
119
  RDig.logger.debug "fetching #{@uri.to_s}"
120
120
  open(@uri.to_s, RDig::open_uri_http_options) do |doc|
121
- case doc.status.first.to_i
122
- when 200
123
- @etag = doc.meta['etag']
124
- # puts "etag: #{@etag}"
125
- @content = ContentExtractors.process(doc.read, doc.content_type)
126
- @status = :success
127
- when 404
128
- RDig.logger.info "got 404 for #{@uri}"
121
+ if @uri.to_s != doc.base_uri.to_s
122
+ @status = :redirect
123
+ @content = doc.base_uri
129
124
  else
130
- RDig.logger.info "don't know what to do with response: #{doc.status.join(' : ')}"
125
+ case doc.status.first.to_i
126
+ when 200
127
+ @etag = doc.meta['etag']
128
+ @content = ContentExtractors.process(doc.read, doc.content_type)
129
+ @status = :success
130
+ when 404
131
+ RDig.logger.info "got 404 for #{@uri}"
132
+ else
133
+ RDig.logger.info "don't know what to do with response: #{doc.status.join(' : ')}"
134
+ end
131
135
  end
132
136
  end
133
137
  rescue
@@ -22,7 +22,7 @@ module RDig
22
22
  end
23
23
 
24
24
  # add a filter and it's args to the chain
25
- # when args is a symbol, it is treated as a configuration key
25
+ # if args is a symbol, it is treated as a configuration key
26
26
  def add(filter, args=nil)
27
27
  args = RDig.config.crawler.send(args) if args.is_a? Symbol
28
28
  case filter
@@ -163,7 +163,7 @@ module RDig
163
163
  return document
164
164
  end
165
165
 
166
- # expands both href="/path/xyz.html" and href="affe.html"
166
+ # expands href="/path/xyz.html", href="affe.html" and href="../lala.html"
167
167
  # to full urls
168
168
  def UrlFilters.fix_relative_uri(document)
169
169
  #return nil unless document.uri.scheme.nil? || document.uri.scheme =~ /^https?/i
@@ -175,11 +175,13 @@ module RDig
175
175
  uri.port = ref.port unless uri.port || ref.port==ref.default_port
176
176
  uri.path = ref.path unless uri.path
177
177
 
178
- if uri.path !~ /^\//
178
+ old_uri_path = uri.path
179
+ if uri.path !~ /^\// || uri.path =~ /^\.\./
179
180
  ref_path = ref.path || '/'
180
181
  ref_path << '/' if ref_path.empty?
181
182
  uri.path = ref_path[0..ref_path.rindex('/')] + uri.path
182
- end
183
+ end
184
+ uri.path = uri.path.sub( /\/[^\/]*\/\.\./, "" ) if old_uri_path =~ /^\.\./
183
185
  return document
184
186
  rescue
185
187
  p document
@@ -193,12 +195,17 @@ module RDig
193
195
  return document if include_hosts.nil? || include_hosts.empty? || include_hosts.include?(document.uri.host)
194
196
  end
195
197
 
196
- def UrlFilters.normalize_uri(document)
198
+ def UrlFilters.normalize_uri(document, cfg)
197
199
  document.uri.fragment = nil
198
200
  # document.uri.query = nil
199
- # append index document if configured and path ends with a slash
200
- if RDig.config.index_document && document.uri.path =~ /\/$/
201
- document.uri.path << RDig.config.index_document
201
+ # trailing slash handling
202
+ if document.uri.path =~ /\/$/
203
+ # append index document if configured
204
+ if cfg.index_document
205
+ document.uri.path << RDig.config.index_document
206
+ elsif cfg.remove_trailing_slash
207
+ document.uri.path.gsub! /\/$/, ''
208
+ end
202
209
  end
203
210
  return document
204
211
  end
data/rakefile CHANGED
@@ -132,6 +132,7 @@ else
132
132
 
133
133
  s.add_dependency('ferret', '>= 0.10.0')
134
134
  s.add_dependency('hpricot', '>= 0.6')
135
+ s.add_dependency('htmlentities', '>= 4.0.0')
135
136
  #s.requirements << ""
136
137
 
137
138
  #### Which files are to be included in this gem? Everything! (Except CVS directories.)
@@ -321,7 +322,7 @@ task :tag => [:prerelease] do
321
322
  if ENV['RELTEST']
322
323
  announce "Release Task Testing, skipping tagging"
323
324
  else
324
- sh %{cd ..; svn copy trunk tags/#{reltag}}
325
+ sh %{svn copy svn+ssh://jkraemer@rubyforge.org/var/svn/rdig/trunk svn+ssh://jkraemer@rubyforge.org/var/svn/rdig/tags/#{reltag}}
325
326
  end
326
327
  end
327
328
 
@@ -0,0 +1,13 @@
1
+ <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
2
+ <html>
3
+ <head>
4
+ <meta http-equiv="content-type" content="text/html;charset=iso-8859-1">
5
+ <title>Sample &amp; Title</title>
6
+ </head>
7
+ <body>
8
+ <frameset>
9
+ <frame src="http://test.host/first.html" />
10
+ <frame src="/second.html" />
11
+ </frameset>
12
+ </body>
13
+ </html>
@@ -0,0 +1,13 @@
1
+ <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
2
+ <html>
3
+ <head>
4
+ <meta http-equiv="content-type" content="text/html;charset=iso-8859-1">
5
+ <title>Sample &amp; Title</title>
6
+ </head>
7
+ <body>
8
+ <map>
9
+ <area href="http://test.host/first.html" />
10
+ <area href="/second.html" />
11
+ </map>
12
+ </body>
13
+ </html>
@@ -52,6 +52,18 @@ class HpricotContentExtractorTest < Test::Unit::TestCase
52
52
  assert_equal '/inside.html', result[:links][1]
53
53
  assert_equal '/footer.html', result[:links][2]
54
54
  end
55
+
56
+ def test_extracts_links_from_frameset
57
+ result = @extractor.process(html_doc('frameset'))
58
+ assert_equal 'http://test.host/first.html', result[:links].first
59
+ assert_equal '/second.html', result[:links].last
60
+ end
61
+
62
+ def test_extracts_links_from_imagemap
63
+ result = @extractor.process(html_doc('imagemap'))
64
+ assert_equal 'http://test.host/first.html', result[:links].first
65
+ assert_equal '/second.html', result[:links].last
66
+ end
55
67
 
56
68
 
57
69
  def test_title_from_dcmeta
@@ -28,8 +28,8 @@ class SearcherTest < Test::Unit::TestCase
28
28
 
29
29
  def test_search
30
30
  result = RDig.searcher.search 'some sample text'
31
- assert_equal 3, result[:hitcount]
32
- assert_equal 3, result[:list].size
31
+ assert_equal 5, result[:hitcount]
32
+ assert_equal 5, result[:list].size
33
33
  end
34
34
 
35
35
  end
@@ -74,7 +74,7 @@ class UrlFiltersTest < Test::Unit::TestCase
74
74
  doc = Document.create('http://test.host/dir/file.html')
75
75
  assert_equal('http://test.host/dir/another.html',
76
76
  UrlFilters.fix_relative_uri(doc.create_child('another.html')).uri.to_s)
77
- assert_equal('http://test.host/dir/../another.html',
77
+ assert_equal('http://test.host/another.html',
78
78
  UrlFilters.fix_relative_uri(doc.create_child('../another.html')).uri.to_s)
79
79
  assert_equal('http://test.host/dir/another.html',
80
80
  UrlFilters.fix_relative_uri(doc.create_child('/dir/another.html')).uri.to_s)
metadata CHANGED
@@ -1,7 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rdig
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.5
4
+ hash: 3
5
+ prerelease:
6
+ segments:
7
+ - 0
8
+ - 3
9
+ - 8
10
+ version: 0.3.8
5
11
  platform: ruby
6
12
  authors:
7
13
  - Jens Kraemer
@@ -9,117 +15,157 @@ autorequire:
9
15
  bindir: bin
10
16
  cert_chain: []
11
17
 
12
- date: 2008-02-26 00:00:00 +01:00
13
- default_executable: rdig
18
+ date: 2009-04-26 00:00:00 +02:00
19
+ default_executable:
14
20
  dependencies:
15
21
  - !ruby/object:Gem::Dependency
16
22
  name: ferret
17
- version_requirement:
18
- version_requirements: !ruby/object:Gem::Requirement
23
+ prerelease: false
24
+ requirement: &id001 !ruby/object:Gem::Requirement
25
+ none: false
19
26
  requirements:
20
27
  - - ">="
21
28
  - !ruby/object:Gem::Version
22
- version: 0.10.0
23
- version:
29
+ hash: 63
30
+ segments:
31
+ - 0
32
+ - 11
33
+ - 6
34
+ version: 0.11.6
35
+ type: :runtime
36
+ version_requirements: *id001
24
37
  - !ruby/object:Gem::Dependency
25
38
  name: hpricot
26
- version_requirement:
27
- version_requirements: !ruby/object:Gem::Requirement
39
+ prerelease: false
40
+ requirement: &id002 !ruby/object:Gem::Requirement
41
+ none: false
28
42
  requirements:
29
43
  - - ">="
30
44
  - !ruby/object:Gem::Version
45
+ hash: 7
46
+ segments:
47
+ - 0
48
+ - 6
31
49
  version: "0.6"
32
- version:
33
- description: RDig provides an HTTP crawler and content extraction utilities to help building a site search for web sites or intranets. Internally, Ferret is used for the full text indexing. After creating a config file for your site, the index can be built with a single call to rdig. For HTML page crawling, hpricot and rubyful_soup are supported.
50
+ type: :runtime
51
+ version_requirements: *id002
52
+ - !ruby/object:Gem::Dependency
53
+ name: htmlentities
54
+ prerelease: false
55
+ requirement: &id003 !ruby/object:Gem::Requirement
56
+ none: false
57
+ requirements:
58
+ - - ">="
59
+ - !ruby/object:Gem::Version
60
+ hash: 63
61
+ segments:
62
+ - 4
63
+ - 0
64
+ - 0
65
+ version: 4.0.0
66
+ type: :runtime
67
+ version_requirements: *id003
68
+ description: Website crawler and fulltext indexer.
34
69
  email: jk@jkraemer.net
35
- executables:
36
- - rdig
70
+ executables: []
71
+
37
72
  extensions: []
38
73
 
39
74
  extra_rdoc_files:
75
+ - History.txt
76
+ - Manifest.txt
40
77
  - README
78
+ files:
41
79
  - CHANGES
80
+ - History.txt
81
+ - install.rb
42
82
  - LICENSE
43
- - TODO
44
- files:
83
+ - Manifest.txt
84
+ - rakefile
85
+ - README
45
86
  - bin/rdig
46
- - lib/rdig.rb
47
- - lib/rdig
48
- - lib/rdig/url_filters.rb
49
- - lib/rdig/index.rb
50
- - lib/rdig/crawler.rb
87
+ - doc/examples/config.rb
88
+ - lib/rdig/content_extractors/doc.rb
89
+ - lib/rdig/content_extractors/hpricot.rb
90
+ - lib/rdig/content_extractors/pdf.rb
51
91
  - lib/rdig/content_extractors.rb
92
+ - lib/rdig/crawler.rb
93
+ - lib/rdig/documents.rb
52
94
  - lib/rdig/file.rb
53
95
  - lib/rdig/highlight.rb
54
- - lib/rdig/documents.rb
96
+ - lib/rdig/index.rb
55
97
  - lib/rdig/search.rb
56
- - lib/rdig/content_extractors
57
- - lib/rdig/content_extractors/doc.rb
58
- - lib/rdig/content_extractors/hpricot.rb
59
- - lib/rdig/content_extractors/pdf.rb
60
- - lib/htmlentities
61
- - lib/htmlentities/htmlentities.rb
62
- - lib/htmlentities/COPYING
63
- - lib/htmlentities/CHANGES
64
- - lib/htmlentities/README
65
- - test/fixtures
66
- - test/fixtures/word
67
- - test/fixtures/word/simple.doc
68
- - test/fixtures/html
98
+ - lib/rdig/url_filters.rb
99
+ - lib/rdig.rb
69
100
  - test/fixtures/html/custom_tag_selectors.html
70
- - test/fixtures/html/simple.html
71
101
  - test/fixtures/html/entities.html
72
- - test/fixtures/pdf
102
+ - test/fixtures/html/frameset.html
103
+ - test/fixtures/html/imagemap.html
104
+ - test/fixtures/html/simple.html
73
105
  - test/fixtures/pdf/simple.pdf
74
- - test/unit
106
+ - test/fixtures/word/simple.doc
107
+ - test/test_helper.rb
75
108
  - test/unit/crawler_fs_test.rb
109
+ - test/unit/etag_filter_test.rb
110
+ - test/unit/file_document_test.rb
111
+ - test/unit/hpricot_content_extractor_test.rb
112
+ - test/unit/http_document_test.rb
76
113
  - test/unit/pdf_content_extractor_test.rb
77
- - test/unit/word_content_extractor_test.rb
78
114
  - test/unit/rdig_test.rb
79
- - test/unit/http_document_test.rb
80
115
  - test/unit/searcher_test.rb
81
- - test/unit/file_document_test.rb
82
116
  - test/unit/url_filters_test.rb
83
- - test/unit/hpricot_content_extractor_test.rb
84
- - test/unit/etag_filter_test.rb
85
- - test/test_helper.rb
86
- - doc/examples
87
- - doc/examples/config.rb
88
- - LICENSE
89
- - TODO
90
- - CHANGES
91
- - README
92
- - install.rb
93
- - rakefile
117
+ - test/unit/word_content_extractor_test.rb
94
118
  has_rdoc: true
95
- homepage: http://rdig.rubyforge.org/
119
+ homepage: http://github.com/jkraemer/rdig/
120
+ licenses: []
121
+
96
122
  post_install_message:
97
123
  rdoc_options:
98
- - --title
99
- - Rake -- Ruby Make
100
124
  - --main
101
125
  - README
102
- - --line-numbers
103
126
  require_paths:
104
127
  - lib
105
128
  required_ruby_version: !ruby/object:Gem::Requirement
129
+ none: false
106
130
  requirements:
107
131
  - - ">="
108
132
  - !ruby/object:Gem::Version
133
+ hash: 3
134
+ segments:
135
+ - 0
109
136
  version: "0"
110
- version:
111
137
  required_rubygems_version: !ruby/object:Gem::Requirement
138
+ none: false
112
139
  requirements:
113
140
  - - ">="
114
141
  - !ruby/object:Gem::Version
142
+ hash: 3
143
+ segments:
144
+ - 0
115
145
  version: "0"
116
- version:
117
146
  requirements: []
118
147
 
119
148
  rubyforge_project: rdig
120
- rubygems_version: 1.0.1
149
+ rubygems_version: 1.5.3
121
150
  signing_key:
122
- specification_version: 2
123
- summary: Ruby based web site indexing and searching library.
124
- test_files: []
125
-
151
+ specification_version: 3
152
+ summary: Crawler and content extractor for building a full text index of a website's contents. Uses Ferret for indexing.
153
+ test_files:
154
+ - test/fixtures/html/custom_tag_selectors.html
155
+ - test/fixtures/html/entities.html
156
+ - test/fixtures/html/frameset.html
157
+ - test/fixtures/html/imagemap.html
158
+ - test/fixtures/html/simple.html
159
+ - test/fixtures/pdf/simple.pdf
160
+ - test/fixtures/word/simple.doc
161
+ - test/test_helper.rb
162
+ - test/unit/crawler_fs_test.rb
163
+ - test/unit/etag_filter_test.rb
164
+ - test/unit/file_document_test.rb
165
+ - test/unit/hpricot_content_extractor_test.rb
166
+ - test/unit/http_document_test.rb
167
+ - test/unit/pdf_content_extractor_test.rb
168
+ - test/unit/rdig_test.rb
169
+ - test/unit/searcher_test.rb
170
+ - test/unit/url_filters_test.rb
171
+ - test/unit/word_content_extractor_test.rb
data/TODO DELETED
File without changes
@@ -1,21 +0,0 @@
1
- == 2.2 (2005-11-07)
2
- * Important bug fixes -- thanks to Moonwolf
3
- * Decoding hexadecimal entities now accepts 'f' as a hex digit. (D'oh!)
4
- * Decimal decoding edge cases addressed.
5
- * Test cases added.
6
-
7
- == 2.1 (2005-10-31)
8
- * Removed some unnecessary code in basic entity encoding.
9
- * Improved handling of encoding: commands are now automatically sorted, so the
10
- user doesn't have to worry about their order.
11
- * Now using setup.rb.
12
- * Tests moved to separate file.
13
-
14
- == 2.0 (2005-08-23)
15
- * Added encoding to entities.
16
- * Decoding interface unchanged.
17
- * Fixed a bug with handling high codepoints.
18
-
19
- == 1.0 (2005-08-03)
20
- * Initial release.
21
- * Decoding only.
@@ -1,7 +0,0 @@
1
- Copyright (c) 2005 Paul Battley
2
-
3
- Usage of the works is permitted provided that this instrument is retained
4
- with the works, so that any entity that uses the works is notified of this
5
- instrument.
6
-
7
- DISCLAIMER: THE WORKS ARE WITHOUT WARRANTY.
@@ -1,15 +0,0 @@
1
- HTML entity encoding and decoding for Ruby
2
-
3
- This library extends the String class to allow encoding and decoding of
4
- HTML/XML entities from/to their corresponding UTF-8 codepoints.
5
-
6
- To install (requires root/admin privileges):
7
-
8
- # ruby setup.rb
9
-
10
- To test:
11
-
12
- $ ruby setup.rb test
13
-
14
- Comments are welcome. Send an email to pbattley @ gmail.com.
15
-
@@ -1,281 +0,0 @@
1
- #
2
- # HTML entity encoding and decoding for Ruby
3
- #
4
- # Author:: Paul BATTLEY (pbattley @ gmail.com)
5
- # Version:: 2.2
6
- # Date:: 2005-11-07
7
- #
8
- # == About
9
- #
10
- # This library extends the String class to allow encoding and decoding of
11
- # HTML/XML entities from/to their corresponding UTF-8 codepoints.
12
- #
13
- # == Licence
14
- #
15
- # Copyright (c) 2005 Paul Battley
16
- #
17
- # Usage of the works is permitted provided that this instrument is retained
18
- # with the works, so that any entity that uses the works is notified of this
19
- # instrument.
20
- #
21
- # DISCLAIMER: THE WORKS ARE WITHOUT WARRANTY.
22
- #
23
-
24
- module HTMLEntities
25
-
26
- VERSION = '2.2'
27
-
28
- #
29
- # MAP is a hash of all the HTML entities I could discover, as taken
30
- # from the w3schools page on the subject:
31
- # http://www.w3schools.com/html/html_entitiesref.asp
32
- # The format is 'entity name' => codepoint where entity name is given
33
- # without the surrounding ampersand and semicolon.
34
- #
35
- MAP = {
36
- 'quot' => 34,
37
- 'apos' => 39,
38
- 'amp' => 38,
39
- 'lt' => 60,
40
- 'gt' => 62,
41
- 'nbsp' => 160,
42
- 'iexcl' => 161,
43
- 'curren' => 164,
44
- 'cent' => 162,
45
- 'pound' => 163,
46
- 'yen' => 165,
47
- 'brvbar' => 166,
48
- 'sect' => 167,
49
- 'uml' => 168,
50
- 'copy' => 169,
51
- 'ordf' => 170,
52
- 'laquo' => 171,
53
- 'not' => 172,
54
- 'shy' => 173,
55
- 'reg' => 174,
56
- 'trade' => 8482,
57
- 'macr' => 175,
58
- 'deg' => 176,
59
- 'plusmn' => 177,
60
- 'sup2' => 178,
61
- 'sup3' => 179,
62
- 'acute' => 180,
63
- 'micro' => 181,
64
- 'para' => 182,
65
- 'middot' => 183,
66
- 'cedil' => 184,
67
- 'sup1' => 185,
68
- 'ordm' => 186,
69
- 'raquo' => 187,
70
- 'frac14' => 188,
71
- 'frac12' => 189,
72
- 'frac34' => 190,
73
- 'iquest' => 191,
74
- 'times' => 215,
75
- 'divide' => 247,
76
- 'Agrave' => 192,
77
- 'Aacute' => 193,
78
- 'Acirc' => 194,
79
- 'Atilde' => 195,
80
- 'Auml' => 196,
81
- 'Aring' => 197,
82
- 'AElig' => 198,
83
- 'Ccedil' => 199,
84
- 'Egrave' => 200,
85
- 'Eacute' => 201,
86
- 'Ecirc' => 202,
87
- 'Euml' => 203,
88
- 'Igrave' => 204,
89
- 'Iacute' => 205,
90
- 'Icirc' => 206,
91
- 'Iuml' => 207,
92
- 'ETH' => 208,
93
- 'Ntilde' => 209,
94
- 'Ograve' => 210,
95
- 'Oacute' => 211,
96
- 'Ocirc' => 212,
97
- 'Otilde' => 213,
98
- 'Ouml' => 214,
99
- 'Oslash' => 216,
100
- 'Ugrave' => 217,
101
- 'Uacute' => 218,
102
- 'Ucirc' => 219,
103
- 'Uuml' => 220,
104
- 'Yacute' => 221,
105
- 'THORN' => 222,
106
- 'szlig' => 223,
107
- 'agrave' => 224,
108
- 'aacute' => 225,
109
- 'acirc' => 226,
110
- 'atilde' => 227,
111
- 'auml' => 228,
112
- 'aring' => 229,
113
- 'aelig' => 230,
114
- 'ccedil' => 231,
115
- 'egrave' => 232,
116
- 'eacute' => 233,
117
- 'ecirc' => 234,
118
- 'euml' => 235,
119
- 'igrave' => 236,
120
- 'iacute' => 237,
121
- 'icirc' => 238,
122
- 'iuml' => 239,
123
- 'eth' => 240,
124
- 'ntilde' => 241,
125
- 'ograve' => 242,
126
- 'oacute' => 243,
127
- 'ocirc' => 244,
128
- 'otilde' => 245,
129
- 'ouml' => 246,
130
- 'oslash' => 248,
131
- 'ugrave' => 249,
132
- 'uacute' => 250,
133
- 'ucirc' => 251,
134
- 'uuml' => 252,
135
- 'yacute' => 253,
136
- 'thorn' => 254,
137
- 'yuml' => 255,
138
- 'OElig' => 338,
139
- 'oelig' => 339,
140
- 'Scaron' => 352,
141
- 'scaron' => 353,
142
- 'Yuml' => 376,
143
- 'circ' => 710,
144
- 'tilde' => 732,
145
- 'ensp' => 8194,
146
- 'emsp' => 8195,
147
- 'thinsp' => 8201,
148
- 'zwnj' => 8204,
149
- 'zwj' => 8205,
150
- 'lrm' => 8206,
151
- 'rlm' => 8207,
152
- 'ndash' => 8211,
153
- 'mdash' => 8212,
154
- 'lsquo' => 8216,
155
- 'rsquo' => 8217,
156
- 'sbquo' => 8218,
157
- 'ldquo' => 8220,
158
- 'rdquo' => 8221,
159
- 'bdquo' => 8222,
160
- 'dagger' => 8224,
161
- 'Dagger' => 8225,
162
- 'hellip' => 8230,
163
- 'permil' => 8240,
164
- 'lsaquo' => 8249,
165
- 'rsaquo' => 8250,
166
- 'euro' => 8364
167
- }
168
-
169
- MIN_LENGTH = MAP.keys.map{ |a| a.length }.min
170
- MAX_LENGTH = MAP.keys.map{ |a| a.length }.max
171
-
172
- # Precompile the regexp
173
- NAMED_ENTITY_REGEXP =
174
- /&([a-z]{#{HTMLEntities::MIN_LENGTH},#{HTMLEntities::MAX_LENGTH}});/i
175
-
176
- # Reverse map for converting characters to named entities
177
- REVERSE_MAP = MAP.invert
178
-
179
- BASIC_ENTITY_REGEXP = /[<>'"&]/
180
-
181
- UTF8_NON_ASCII_REGEXP = /[\x00-\x1f]|[\xc0-\xfd][\x80-\xbf]+/
182
-
183
- end
184
-
185
- class String
186
-
187
- # Because there's no need to make the user worry about the order here,
188
- # let's handle it.
189
- ENCODE_ENTITIES_COMMAND_ORDER = {
190
- :basic => 0,
191
- :named => 1,
192
- :decimal => 2,
193
- :hexadecimal => 3
194
- }
195
-
196
- #
197
- # Decode XML and HTML 4.01 entities in a string into their UTF-8
198
- # equivalents. Obviously, if your string is not already in UTF-8, you'd
199
- # better convert it before using this method, or the output will be mixed
200
- # up.
201
- # Unknown named entities are not converted
202
- #
203
- def decode_entities
204
- return gsub(HTMLEntities::NAMED_ENTITY_REGEXP) {
205
- HTMLEntities::MAP.has_key?($1) ? [HTMLEntities::MAP[$1]].pack('U') : $&
206
- }.gsub(/&#([0-9]{1,7});/) {
207
- [$1.to_i].pack('U')
208
- }.gsub(/&#x([0-9a-f]{1,6});/i) {
209
- [$1.to_i(16)].pack('U')
210
- }
211
- end
212
-
213
- #
214
- # Encode codepoints into their corresponding entities. Various operations
215
- # are possible, and may be specified in order:
216
- #
217
- # :basic :: Convert the five XML entities ('"<>&)
218
- # :named :: Convert non-ASCII characters to their named HTML 4.01 equivalent
219
- # :decimal :: Convert non-ASCII characters to decimal entities (e.g. &#1234;)
220
- # :hexadecimal :: Convert non-ASCII characters to hexadecimal entities (e.g. # &#x12ab;)
221
- #
222
- # You can specify the commands in any order, but they will be executed in
223
- # the order listed above to ensure that entity ampersands are not
224
- # clobbered and that named entities are replaced before numeric ones.
225
- #
226
- # If no instructions are specified, :basic will be used.
227
- #
228
- # Examples:
229
- # str.encode_entities - XML-safe
230
- # str.encode_entities(:basic, :decimal) - XML-safe and 7-bit clean
231
- # str.encode_entities(:basic, :named, :decimal) - 7-bit clean, with all
232
- # non-ASCII characters replaced with their named entity where possible, and
233
- # decimal equivalents otherwise.
234
- #
235
- # Note: It is the program's responsibility to ensure that the string
236
- # contains valid UTF-8 before calling this method.
237
- #
238
- def encode_entities(*instructions)
239
- str = nil
240
- if (instructions.empty?)
241
- instructions = [:basic]
242
- else
243
- instructions.each do |instr|
244
- unless ENCODE_ENTITIES_COMMAND_ORDER[instr]
245
- raise RuntimeError, "unknown encode_entities command `#{instr.inspect}'"
246
- end
247
- end
248
- instructions.sort! { |a,b|
249
- ENCODE_ENTITIES_COMMAND_ORDER[a] <=>
250
- ENCODE_ENTITIES_COMMAND_ORDER[b]
251
- }
252
- end
253
- instructions.each do |instruction|
254
- case instruction
255
- when :basic
256
- # Handled as basic ASCII
257
- str = (str || self).gsub(HTMLEntities::BASIC_ENTITY_REGEXP) {
258
- # It's safe to use the simpler [0] here because we know
259
- # that the basic entities are ASCII.
260
- '&' << HTMLEntities::REVERSE_MAP[$&[0]] << ';'
261
- }
262
- when :named
263
- # Test everything except printable ASCII
264
- str = (str || self).gsub(HTMLEntities::UTF8_NON_ASCII_REGEXP) {
265
- cp = $&.unpack('U')[0]
266
- (e = HTMLEntities::REVERSE_MAP[cp]) ? "&#{e};" : $&
267
- }
268
- when :decimal
269
- str = (str || self).gsub(HTMLEntities::UTF8_NON_ASCII_REGEXP) {
270
- "&##{$&.unpack('U')[0]};"
271
- }
272
- when :hexadecimal
273
- str = (str || self).gsub(HTMLEntities::UTF8_NON_ASCII_REGEXP) {
274
- "&#x#{$&.unpack('U')[0].to_s(16)};"
275
- }
276
- end
277
- end
278
- return str
279
- end
280
-
281
- end