rdig 0.3.5 → 0.3.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGES CHANGED
@@ -1,8 +1,13 @@
1
+ 0.3.6
2
+ - remove bundled htmlentities in favor of a gem dependency
3
+ - also extract links from area and frame tags
4
+ - fix etagfilter bug
5
+
1
6
  0.3.5
2
7
  - Add max_depth option to crawler configuration for limiting the crawl to a
3
8
  specific depth
4
9
  - add support for http proxies including basic authentication
5
- - remove rubyfoul_soup support
10
+ - remove rubyful_soup support
6
11
 
7
12
  0.3.4
8
13
 
@@ -0,0 +1,11 @@
1
+ == 0.3.8 2009-04-26
2
+
3
+ * bump up version
4
+
5
+ == 0.3.7 2009-04-26
6
+
7
+ * Gem spec for automatic gem building on github
8
+ * doc enhancements
9
+ * better uri-normalization, re-add result uri of redirection
10
+ into the queue instead of directly indexing the resulting
11
+ page
@@ -0,0 +1,39 @@
1
+ CHANGES
2
+ History.txt
3
+ install.rb
4
+ LICENSE
5
+ Manifest.txt
6
+ rakefile
7
+ README
8
+ bin/rdig
9
+ doc/examples/config.rb
10
+ lib/rdig/content_extractors/doc.rb
11
+ lib/rdig/content_extractors/hpricot.rb
12
+ lib/rdig/content_extractors/pdf.rb
13
+ lib/rdig/content_extractors.rb
14
+ lib/rdig/crawler.rb
15
+ lib/rdig/documents.rb
16
+ lib/rdig/file.rb
17
+ lib/rdig/highlight.rb
18
+ lib/rdig/index.rb
19
+ lib/rdig/search.rb
20
+ lib/rdig/url_filters.rb
21
+ lib/rdig.rb
22
+ test/fixtures/html/custom_tag_selectors.html
23
+ test/fixtures/html/entities.html
24
+ test/fixtures/html/frameset.html
25
+ test/fixtures/html/imagemap.html
26
+ test/fixtures/html/simple.html
27
+ test/fixtures/pdf/simple.pdf
28
+ test/fixtures/word/simple.doc
29
+ test/test_helper.rb
30
+ test/unit/crawler_fs_test.rb
31
+ test/unit/etag_filter_test.rb
32
+ test/unit/file_document_test.rb
33
+ test/unit/hpricot_content_extractor_test.rb
34
+ test/unit/http_document_test.rb
35
+ test/unit/pdf_content_extractor_test.rb
36
+ test/unit/rdig_test.rb
37
+ test/unit/searcher_test.rb
38
+ test/unit/url_filters_test.rb
39
+ test/unit/word_content_extractor_test.rb
@@ -86,6 +86,13 @@ RDig.configuration do |cfg|
86
86
  # Set to 0 to only index the start_urls.
87
87
  # cfg.crawler.max_depth = nil
88
88
 
89
+ # default index document to be appended to URIs ending with a trailing '/'
90
+ # cfg.crawler.normalize_uri.index_document = nil
91
+ # strip trailing '/' from URIs to avoid double indexing of pages referred to both with and without a trailing '/'.
92
+ # Ignored if index_document is set.
93
+ # Not necessary when the server issues proper etags, since the default etag filter will discard these duplicates.
94
+ # cfg.crawler.normalize_uri.remove_trailing_slash = nil
95
+
89
96
  # http proxy configuration
90
97
  # proxy url
91
98
  # cfg.crawler.http_proxy = nil
@@ -94,6 +101,9 @@ RDig.configuration do |cfg|
94
101
  # cfg.crawler.http_proxy_user = nil
95
102
  # proxy password
96
103
  # cfg.crawler.http_proxy_pass = nil
104
+ #
105
+ # to use basic auth without a proxy, use this syntax:
106
+ # cfg.crawler.open_uri_http_options = { :http_basic_authentication => [user, password] }
97
107
 
98
108
  # indexer options
99
109
 
@@ -24,7 +24,7 @@
24
24
  #++
25
25
  #
26
26
 
27
- RDIGVERSION = '0.3.5'
27
+ RDIGVERSION = '0.3.8'
28
28
 
29
29
 
30
30
  require 'thread'
@@ -49,7 +49,8 @@ rescue LoadError
49
49
  require 'ferret'
50
50
  end
51
51
 
52
- require 'htmlentities/htmlentities'
52
+
53
+ #require 'htmlentities/htmlentities'
53
54
 
54
55
 
55
56
  $KCODE = 'u'
@@ -60,17 +61,16 @@ module RDig
60
61
 
61
62
  class << self
62
63
 
63
- # the filter chains are for limiting the set of indexed documents.
64
- # there are two chain types - one for http, and one for file system
65
- # crawling.
66
- # a document has to survive all filters in the chain to get indexed.
64
+ # Filter chains are used by the crawler to limit the set of documents being indexed.
65
+ # There are two chains - one for http, and one for file system crawling.
66
+ # Each document has to survive all filters in the relevant chain to get indexed.
67
67
  def filter_chain
68
68
  @filter_chain ||= {
69
69
  # filter chain for http crawling
70
70
  :http => [
71
71
  :scheme_filter_http,
72
72
  :fix_relative_uri,
73
- :normalize_uri,
73
+ { :normalize_uri => :normalize_uri },
74
74
  { RDig::UrlFilters::DepthFilter => :max_depth },
75
75
  { :hostname_filter => :include_hosts },
76
76
  { RDig::UrlFilters::UrlInclusionFilter => :include_documents },
@@ -120,7 +120,11 @@ module RDig
120
120
  :wait_before_leave => 10,
121
121
  :http_proxy => nil,
122
122
  :http_proxy_user => nil,
123
- :http_proxy_pass => nil
123
+ :http_proxy_pass => nil,
124
+ :normalize_uri => OpenStruct.new(
125
+ :index_document => nil,
126
+ :remove_trailing_slash => nil
127
+ )
124
128
  ),
125
129
  :content_extraction => OpenStruct.new(
126
130
  # settings for html content extraction (hpricot)
@@ -23,7 +23,13 @@ module RDig
23
23
  def self.extractor_instances
24
24
  @@extractor_instances ||= extractors.map { |ex_class|
25
25
  RDig.logger.info "initializing content extractor: #{ex_class}"
26
- ex_class.new(RDig.configuration.content_extraction) rescue nil
26
+ ex = nil
27
+ begin
28
+ ex = ex_class.new(RDig.configuration.content_extraction)
29
+ rescue Exception
30
+ RDig.logger.error "error: #{$!.message}\n#{$!.backtrace.join("\n")}"
31
+ end
32
+ ex
27
33
  }.compact
28
34
  end
29
35
 
@@ -13,25 +13,11 @@ module RDig
13
13
  @wvhtml = 'wvHtml'
14
14
  @pattern = /^application\/msword/
15
15
  # html extractor for parsing wvHtml output
16
- if defined?(HpricotContentExtractor)
17
- @html_extractor = HpricotContentExtractor.new(OpenStruct.new(
18
- :hpricot => OpenStruct.new(
19
- :content_tag_selector => 'body',
20
- :title_tag_selector => 'title'
21
- )))
22
- elsif defined?(RubyfulSoupContentExtractor)
23
- @html_extractor = RubyfulSoupContentExtractor.new(OpenStruct.new(
24
- :rubyful_soup => OpenStruct.new(
25
- :content_tag_selector => lambda { |tagsoup|
26
- tagsoup.html.body
27
- },
28
- :title_tag_selector => lambda { |tagsoup|
29
- tagsoup.html.head.title
30
- }
31
- )))
32
- else
33
- raise "need at least one html content extractor - please install hpricot or rubyful_soup"
34
- end
16
+ @html_extractor = HpricotContentExtractor.new(OpenStruct.new(
17
+ :hpricot => OpenStruct.new(
18
+ :content_tag_selector => 'body',
19
+ :title_tag_selector => 'title'
20
+ )))
35
21
  # TODO: better: if $?.exitstatus == 127 (not found)
36
22
  @available = %x{#{@wvhtml} -h 2>&1} =~ /Dom Lachowicz/
37
23
  end
@@ -1,11 +1,12 @@
1
1
  begin
2
2
  require 'hpricot'
3
+ require 'htmlentities'
3
4
  rescue LoadError
4
5
  require 'rubygems'
5
6
  require 'hpricot'
7
+ require 'htmlentities'
6
8
  end
7
9
 
8
- if defined?(Hpricot)
9
10
  module RDig
10
11
  module ContentExtractors
11
12
 
@@ -23,11 +24,12 @@ module RDig
23
24
  # :title => 'Title',
24
25
  # :links => [array of urls] }
25
26
  def process(content)
27
+ entities = HTMLEntities.new
26
28
  doc = Hpricot(content)
27
29
  {
28
- :title => extract_title(doc).decode_entities.strip,
30
+ :title => entities.decode(extract_title(doc)).strip,
29
31
  :links => extract_links(doc),
30
- :content => extract_content(doc).decode_entities
32
+ :content => entities.decode(extract_content(doc))
31
33
  }
32
34
  end
33
35
 
@@ -50,12 +52,14 @@ module RDig
50
52
  # extracts the href attributes of all a tags, except
51
53
  # internal links like <a href="#top">
52
54
  def extract_links(doc)
53
- (doc/'a').map { |link|
54
- href = link['href']
55
- CGI.unescapeHTML(href) if href && href !~ /^#/
56
- }.compact
55
+ {'a' => 'href', 'area' => 'href', 'frame' => 'src'}.map do |tag, attr|
56
+ (doc/tag).map do |tag|
57
+ value = tag[attr]
58
+ CGI.unescapeHTML(value) if value && value !~ /^#/
59
+ end
60
+ end.flatten.compact
57
61
  end
58
-
62
+
59
63
  # Extracts the title from the given html tree
60
64
  def extract_title(doc)
61
65
  the_title_tag = title_tag(doc)
@@ -85,6 +89,7 @@ module RDig
85
89
  def strip_comments(string)
86
90
  string.gsub Regexp.new('<!--.*?-->', Regexp::MULTILINE, 'u'), ''
87
91
  end
92
+
88
93
  def strip_tags(string)
89
94
  string.gsub! Regexp.new('<(script|style).*?>.*?<\/(script|style).*?>',
90
95
  Regexp::MULTILINE, 'u'), ''
@@ -98,4 +103,3 @@ module RDig
98
103
 
99
104
  end
100
105
  end
101
- end
@@ -5,7 +5,6 @@ module RDig
5
5
 
6
6
  def initialize(config = RDig.config, logger = RDig.logger)
7
7
  @documents = Queue.new
8
- @etag_filter = ETagFilter.new
9
8
  @logger = logger
10
9
  @config = config
11
10
  end
@@ -22,7 +21,8 @@ module RDig
22
21
  # check whether we are indexing on-disk or via http
23
22
  url_type = @config.crawler.start_urls.first =~ /^file:\/\// ? :file : :http
24
23
  chain_config = RDig.filter_chain[url_type]
25
-
24
+
25
+ @etag_filter = ETagFilter.new
26
26
  filterchain = UrlFilters::FilterChain.new(chain_config)
27
27
  @config.crawler.start_urls.each { |url| add_url(url, filterchain) }
28
28
 
@@ -52,13 +52,21 @@ module RDig
52
52
  def process_document(doc, filterchain)
53
53
  @logger.debug "processing document #{doc}"
54
54
  doc.fetch
55
- # add links from this document to the queue
56
- doc.content[:links].each { |url|
57
- add_url(url, filterchain, doc)
58
- } unless doc.content[:links].nil?
59
-
60
- return unless @etag_filter.apply(doc)
61
- add_to_index doc
55
+ case doc.status
56
+ when :success
57
+ if @etag_filter.apply(doc)
58
+ # add links from this document to the queue
59
+ doc.content[:links].each { |url|
60
+ add_url(url, filterchain, doc)
61
+ } unless doc.content[:links].nil?
62
+ add_to_index doc
63
+ end
64
+ when :redirect
65
+ @logger.debug "redirect to #{doc.content}"
66
+ add_url(doc.content, filterchain, doc)
67
+ else
68
+ @logger.error "unknown doc status #{doc.status}: #{doc}"
69
+ end
62
70
  rescue
63
71
  @logger.error "error processing document #{doc.uri.to_s}: #{$!}"
64
72
  @logger.debug "Trace: #{$!.backtrace.join("\n")}"
@@ -110,7 +118,7 @@ module RDig
110
118
  end
111
119
 
112
120
  def apply(document)
113
- return document unless (document.respond_to?(:etag) && document.etag)
121
+ return document unless (document.respond_to?(:etag) && document.etag && !document.etag.empty?)
114
122
  synchronize do
115
123
  @etags.add?(document.etag) ? document : nil
116
124
  end
@@ -118,16 +118,20 @@ module RDig
118
118
  def fetch
119
119
  RDig.logger.debug "fetching #{@uri.to_s}"
120
120
  open(@uri.to_s, RDig::open_uri_http_options) do |doc|
121
- case doc.status.first.to_i
122
- when 200
123
- @etag = doc.meta['etag']
124
- # puts "etag: #{@etag}"
125
- @content = ContentExtractors.process(doc.read, doc.content_type)
126
- @status = :success
127
- when 404
128
- RDig.logger.info "got 404 for #{@uri}"
121
+ if @uri.to_s != doc.base_uri.to_s
122
+ @status = :redirect
123
+ @content = doc.base_uri
129
124
  else
130
- RDig.logger.info "don't know what to do with response: #{doc.status.join(' : ')}"
125
+ case doc.status.first.to_i
126
+ when 200
127
+ @etag = doc.meta['etag']
128
+ @content = ContentExtractors.process(doc.read, doc.content_type)
129
+ @status = :success
130
+ when 404
131
+ RDig.logger.info "got 404 for #{@uri}"
132
+ else
133
+ RDig.logger.info "don't know what to do with response: #{doc.status.join(' : ')}"
134
+ end
131
135
  end
132
136
  end
133
137
  rescue
@@ -22,7 +22,7 @@ module RDig
22
22
  end
23
23
 
24
24
  # add a filter and its args to the chain
25
- # when args is a symbol, it is treated as a configuration key
25
+ # if args is a symbol, it is treated as a configuration key
26
26
  def add(filter, args=nil)
27
27
  args = RDig.config.crawler.send(args) if args.is_a? Symbol
28
28
  case filter
@@ -163,7 +163,7 @@ module RDig
163
163
  return document
164
164
  end
165
165
 
166
- # expands both href="/path/xyz.html" and href="affe.html"
166
+ # expands href="/path/xyz.html", href="affe.html" and href="../lala.html"
167
167
  # to full urls
168
168
  def UrlFilters.fix_relative_uri(document)
169
169
  #return nil unless document.uri.scheme.nil? || document.uri.scheme =~ /^https?/i
@@ -175,11 +175,13 @@ module RDig
175
175
  uri.port = ref.port unless uri.port || ref.port==ref.default_port
176
176
  uri.path = ref.path unless uri.path
177
177
 
178
- if uri.path !~ /^\//
178
+ old_uri_path = uri.path
179
+ if uri.path !~ /^\// || uri.path =~ /^\.\./
179
180
  ref_path = ref.path || '/'
180
181
  ref_path << '/' if ref_path.empty?
181
182
  uri.path = ref_path[0..ref_path.rindex('/')] + uri.path
182
- end
183
+ end
184
+ uri.path = uri.path.sub( /\/[^\/]*\/\.\./, "" ) if old_uri_path =~ /^\.\./
183
185
  return document
184
186
  rescue
185
187
  p document
@@ -193,12 +195,17 @@ module RDig
193
195
  return document if include_hosts.nil? || include_hosts.empty? || include_hosts.include?(document.uri.host)
194
196
  end
195
197
 
196
- def UrlFilters.normalize_uri(document)
198
+ def UrlFilters.normalize_uri(document, cfg)
197
199
  document.uri.fragment = nil
198
200
  # document.uri.query = nil
199
- # append index document if configured and path ends with a slash
200
- if RDig.config.index_document && document.uri.path =~ /\/$/
201
- document.uri.path << RDig.config.index_document
201
+ # trailing slash handling
202
+ if document.uri.path =~ /\/$/
203
+ # append index document if configured
204
+ if cfg.index_document
205
+ document.uri.path << RDig.config.index_document
206
+ elsif cfg.remove_trailing_slash
207
+ document.uri.path.gsub! /\/$/, ''
208
+ end
202
209
  end
203
210
  return document
204
211
  end
data/rakefile CHANGED
@@ -132,6 +132,7 @@ else
132
132
 
133
133
  s.add_dependency('ferret', '>= 0.10.0')
134
134
  s.add_dependency('hpricot', '>= 0.6')
135
+ s.add_dependency('htmlentities', '>= 4.0.0')
135
136
  #s.requirements << ""
136
137
 
137
138
  #### Which files are to be included in this gem? Everything! (Except CVS directories.)
@@ -321,7 +322,7 @@ task :tag => [:prerelease] do
321
322
  if ENV['RELTEST']
322
323
  announce "Release Task Testing, skipping tagging"
323
324
  else
324
- sh %{cd ..; svn copy trunk tags/#{reltag}}
325
+ sh %{svn copy svn+ssh://jkraemer@rubyforge.org/var/svn/rdig/trunk svn+ssh://jkraemer@rubyforge.org/var/svn/rdig/tags/#{reltag}}
325
326
  end
326
327
  end
327
328
 
@@ -0,0 +1,13 @@
1
+ <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
2
+ <html>
3
+ <head>
4
+ <meta http-equiv="content-type" content="text/html;charset=iso-8859-1">
5
+ <title>Sample &amp; Title</title>
6
+ </head>
7
+ <body>
8
+ <frameset>
9
+ <frame src="http://test.host/first.html" />
10
+ <frame src="/second.html" />
11
+ </frameset>
12
+ </body>
13
+ </html>
@@ -0,0 +1,13 @@
1
+ <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
2
+ <html>
3
+ <head>
4
+ <meta http-equiv="content-type" content="text/html;charset=iso-8859-1">
5
+ <title>Sample &amp; Title</title>
6
+ </head>
7
+ <body>
8
+ <map>
9
+ <area href="http://test.host/first.html" />
10
+ <area href="/second.html" />
11
+ </map>
12
+ </body>
13
+ </html>
@@ -52,6 +52,18 @@ class HpricotContentExtractorTest < Test::Unit::TestCase
52
52
  assert_equal '/inside.html', result[:links][1]
53
53
  assert_equal '/footer.html', result[:links][2]
54
54
  end
55
+
56
+ def test_extracts_links_from_frameset
57
+ result = @extractor.process(html_doc('frameset'))
58
+ assert_equal 'http://test.host/first.html', result[:links].first
59
+ assert_equal '/second.html', result[:links].last
60
+ end
61
+
62
+ def test_extracts_links_from_imagemap
63
+ result = @extractor.process(html_doc('imagemap'))
64
+ assert_equal 'http://test.host/first.html', result[:links].first
65
+ assert_equal '/second.html', result[:links].last
66
+ end
55
67
 
56
68
 
57
69
  def test_title_from_dcmeta
@@ -28,8 +28,8 @@ class SearcherTest < Test::Unit::TestCase
28
28
 
29
29
  def test_search
30
30
  result = RDig.searcher.search 'some sample text'
31
- assert_equal 3, result[:hitcount]
32
- assert_equal 3, result[:list].size
31
+ assert_equal 5, result[:hitcount]
32
+ assert_equal 5, result[:list].size
33
33
  end
34
34
 
35
35
  end
@@ -74,7 +74,7 @@ class UrlFiltersTest < Test::Unit::TestCase
74
74
  doc = Document.create('http://test.host/dir/file.html')
75
75
  assert_equal('http://test.host/dir/another.html',
76
76
  UrlFilters.fix_relative_uri(doc.create_child('another.html')).uri.to_s)
77
- assert_equal('http://test.host/dir/../another.html',
77
+ assert_equal('http://test.host/another.html',
78
78
  UrlFilters.fix_relative_uri(doc.create_child('../another.html')).uri.to_s)
79
79
  assert_equal('http://test.host/dir/another.html',
80
80
  UrlFilters.fix_relative_uri(doc.create_child('/dir/another.html')).uri.to_s)
metadata CHANGED
@@ -1,7 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rdig
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.5
4
+ hash: 3
5
+ prerelease:
6
+ segments:
7
+ - 0
8
+ - 3
9
+ - 8
10
+ version: 0.3.8
5
11
  platform: ruby
6
12
  authors:
7
13
  - Jens Kraemer
@@ -9,117 +15,157 @@ autorequire:
9
15
  bindir: bin
10
16
  cert_chain: []
11
17
 
12
- date: 2008-02-26 00:00:00 +01:00
13
- default_executable: rdig
18
+ date: 2009-04-26 00:00:00 +02:00
19
+ default_executable:
14
20
  dependencies:
15
21
  - !ruby/object:Gem::Dependency
16
22
  name: ferret
17
- version_requirement:
18
- version_requirements: !ruby/object:Gem::Requirement
23
+ prerelease: false
24
+ requirement: &id001 !ruby/object:Gem::Requirement
25
+ none: false
19
26
  requirements:
20
27
  - - ">="
21
28
  - !ruby/object:Gem::Version
22
- version: 0.10.0
23
- version:
29
+ hash: 63
30
+ segments:
31
+ - 0
32
+ - 11
33
+ - 6
34
+ version: 0.11.6
35
+ type: :runtime
36
+ version_requirements: *id001
24
37
  - !ruby/object:Gem::Dependency
25
38
  name: hpricot
26
- version_requirement:
27
- version_requirements: !ruby/object:Gem::Requirement
39
+ prerelease: false
40
+ requirement: &id002 !ruby/object:Gem::Requirement
41
+ none: false
28
42
  requirements:
29
43
  - - ">="
30
44
  - !ruby/object:Gem::Version
45
+ hash: 7
46
+ segments:
47
+ - 0
48
+ - 6
31
49
  version: "0.6"
32
- version:
33
- description: RDig provides an HTTP crawler and content extraction utilities to help building a site search for web sites or intranets. Internally, Ferret is used for the full text indexing. After creating a config file for your site, the index can be built with a single call to rdig. For HTML page crawling, hpricot and rubyful_soup are supported.
50
+ type: :runtime
51
+ version_requirements: *id002
52
+ - !ruby/object:Gem::Dependency
53
+ name: htmlentities
54
+ prerelease: false
55
+ requirement: &id003 !ruby/object:Gem::Requirement
56
+ none: false
57
+ requirements:
58
+ - - ">="
59
+ - !ruby/object:Gem::Version
60
+ hash: 63
61
+ segments:
62
+ - 4
63
+ - 0
64
+ - 0
65
+ version: 4.0.0
66
+ type: :runtime
67
+ version_requirements: *id003
68
+ description: Website crawler and fulltext indexer.
34
69
  email: jk@jkraemer.net
35
- executables:
36
- - rdig
70
+ executables: []
71
+
37
72
  extensions: []
38
73
 
39
74
  extra_rdoc_files:
75
+ - History.txt
76
+ - Manifest.txt
40
77
  - README
78
+ files:
41
79
  - CHANGES
80
+ - History.txt
81
+ - install.rb
42
82
  - LICENSE
43
- - TODO
44
- files:
83
+ - Manifest.txt
84
+ - rakefile
85
+ - README
45
86
  - bin/rdig
46
- - lib/rdig.rb
47
- - lib/rdig
48
- - lib/rdig/url_filters.rb
49
- - lib/rdig/index.rb
50
- - lib/rdig/crawler.rb
87
+ - doc/examples/config.rb
88
+ - lib/rdig/content_extractors/doc.rb
89
+ - lib/rdig/content_extractors/hpricot.rb
90
+ - lib/rdig/content_extractors/pdf.rb
51
91
  - lib/rdig/content_extractors.rb
92
+ - lib/rdig/crawler.rb
93
+ - lib/rdig/documents.rb
52
94
  - lib/rdig/file.rb
53
95
  - lib/rdig/highlight.rb
54
- - lib/rdig/documents.rb
96
+ - lib/rdig/index.rb
55
97
  - lib/rdig/search.rb
56
- - lib/rdig/content_extractors
57
- - lib/rdig/content_extractors/doc.rb
58
- - lib/rdig/content_extractors/hpricot.rb
59
- - lib/rdig/content_extractors/pdf.rb
60
- - lib/htmlentities
61
- - lib/htmlentities/htmlentities.rb
62
- - lib/htmlentities/COPYING
63
- - lib/htmlentities/CHANGES
64
- - lib/htmlentities/README
65
- - test/fixtures
66
- - test/fixtures/word
67
- - test/fixtures/word/simple.doc
68
- - test/fixtures/html
98
+ - lib/rdig/url_filters.rb
99
+ - lib/rdig.rb
69
100
  - test/fixtures/html/custom_tag_selectors.html
70
- - test/fixtures/html/simple.html
71
101
  - test/fixtures/html/entities.html
72
- - test/fixtures/pdf
102
+ - test/fixtures/html/frameset.html
103
+ - test/fixtures/html/imagemap.html
104
+ - test/fixtures/html/simple.html
73
105
  - test/fixtures/pdf/simple.pdf
74
- - test/unit
106
+ - test/fixtures/word/simple.doc
107
+ - test/test_helper.rb
75
108
  - test/unit/crawler_fs_test.rb
109
+ - test/unit/etag_filter_test.rb
110
+ - test/unit/file_document_test.rb
111
+ - test/unit/hpricot_content_extractor_test.rb
112
+ - test/unit/http_document_test.rb
76
113
  - test/unit/pdf_content_extractor_test.rb
77
- - test/unit/word_content_extractor_test.rb
78
114
  - test/unit/rdig_test.rb
79
- - test/unit/http_document_test.rb
80
115
  - test/unit/searcher_test.rb
81
- - test/unit/file_document_test.rb
82
116
  - test/unit/url_filters_test.rb
83
- - test/unit/hpricot_content_extractor_test.rb
84
- - test/unit/etag_filter_test.rb
85
- - test/test_helper.rb
86
- - doc/examples
87
- - doc/examples/config.rb
88
- - LICENSE
89
- - TODO
90
- - CHANGES
91
- - README
92
- - install.rb
93
- - rakefile
117
+ - test/unit/word_content_extractor_test.rb
94
118
  has_rdoc: true
95
- homepage: http://rdig.rubyforge.org/
119
+ homepage: http://github.com/jkraemer/rdig/
120
+ licenses: []
121
+
96
122
  post_install_message:
97
123
  rdoc_options:
98
- - --title
99
- - Rake -- Ruby Make
100
124
  - --main
101
125
  - README
102
- - --line-numbers
103
126
  require_paths:
104
127
  - lib
105
128
  required_ruby_version: !ruby/object:Gem::Requirement
129
+ none: false
106
130
  requirements:
107
131
  - - ">="
108
132
  - !ruby/object:Gem::Version
133
+ hash: 3
134
+ segments:
135
+ - 0
109
136
  version: "0"
110
- version:
111
137
  required_rubygems_version: !ruby/object:Gem::Requirement
138
+ none: false
112
139
  requirements:
113
140
  - - ">="
114
141
  - !ruby/object:Gem::Version
142
+ hash: 3
143
+ segments:
144
+ - 0
115
145
  version: "0"
116
- version:
117
146
  requirements: []
118
147
 
119
148
  rubyforge_project: rdig
120
- rubygems_version: 1.0.1
149
+ rubygems_version: 1.5.3
121
150
  signing_key:
122
- specification_version: 2
123
- summary: Ruby based web site indexing and searching library.
124
- test_files: []
125
-
151
+ specification_version: 3
152
+ summary: Crawler and content extractor for building a full text index of a website's contents. Uses Ferret for indexing.
153
+ test_files:
154
+ - test/fixtures/html/custom_tag_selectors.html
155
+ - test/fixtures/html/entities.html
156
+ - test/fixtures/html/frameset.html
157
+ - test/fixtures/html/imagemap.html
158
+ - test/fixtures/html/simple.html
159
+ - test/fixtures/pdf/simple.pdf
160
+ - test/fixtures/word/simple.doc
161
+ - test/test_helper.rb
162
+ - test/unit/crawler_fs_test.rb
163
+ - test/unit/etag_filter_test.rb
164
+ - test/unit/file_document_test.rb
165
+ - test/unit/hpricot_content_extractor_test.rb
166
+ - test/unit/http_document_test.rb
167
+ - test/unit/pdf_content_extractor_test.rb
168
+ - test/unit/rdig_test.rb
169
+ - test/unit/searcher_test.rb
170
+ - test/unit/url_filters_test.rb
171
+ - test/unit/word_content_extractor_test.rb
data/TODO DELETED
File without changes
@@ -1,21 +0,0 @@
1
- == 2.2 (2005-11-07)
2
- * Important bug fixes -- thanks to Moonwolf
3
- * Decoding hexadecimal entities now accepts 'f' as a hex digit. (D'oh!)
4
- * Decimal decoding edge cases addressed.
5
- * Test cases added.
6
-
7
- == 2.1 (2005-10-31)
8
- * Removed some unnecessary code in basic entity encoding.
9
- * Improved handling of encoding: commands are now automatically sorted, so the
10
- user doesn't have to worry about their order.
11
- * Now using setup.rb.
12
- * Tests moved to separate file.
13
-
14
- == 2.0 (2005-08-23)
15
- * Added encoding to entities.
16
- * Decoding interface unchanged.
17
- * Fixed a bug with handling high codepoints.
18
-
19
- == 1.0 (2005-08-03)
20
- * Initial release.
21
- * Decoding only.
@@ -1,7 +0,0 @@
1
- Copyright (c) 2005 Paul Battley
2
-
3
- Usage of the works is permitted provided that this instrument is retained
4
- with the works, so that any entity that uses the works is notified of this
5
- instrument.
6
-
7
- DISCLAIMER: THE WORKS ARE WITHOUT WARRANTY.
@@ -1,15 +0,0 @@
1
- HTML entity encoding and decoding for Ruby
2
-
3
- This library extends the String class to allow encoding and decoding of
4
- HTML/XML entities from/to their corresponding UTF-8 codepoints.
5
-
6
- To install (requires root/admin privileges):
7
-
8
- # ruby setup.rb
9
-
10
- To test:
11
-
12
- $ ruby setup.rb test
13
-
14
- Comments are welcome. Send an email to pbattley @ gmail.com.
15
-
@@ -1,281 +0,0 @@
1
- #
2
- # HTML entity encoding and decoding for Ruby
3
- #
4
- # Author:: Paul BATTLEY (pbattley @ gmail.com)
5
- # Version:: 2.2
6
- # Date:: 2005-11-07
7
- #
8
- # == About
9
- #
10
- # This library extends the String class to allow encoding and decoding of
11
- # HTML/XML entities from/to their corresponding UTF-8 codepoints.
12
- #
13
- # == Licence
14
- #
15
- # Copyright (c) 2005 Paul Battley
16
- #
17
- # Usage of the works is permitted provided that this instrument is retained
18
- # with the works, so that any entity that uses the works is notified of this
19
- # instrument.
20
- #
21
- # DISCLAIMER: THE WORKS ARE WITHOUT WARRANTY.
22
- #
23
-
24
- module HTMLEntities
25
-
26
- VERSION = '2.2'
27
-
28
- #
29
- # MAP is a hash of all the HTML entities I could discover, as taken
30
- # from the w3schools page on the subject:
31
- # http://www.w3schools.com/html/html_entitiesref.asp
32
- # The format is 'entity name' => codepoint where entity name is given
33
- # without the surrounding ampersand and semicolon.
34
- #
35
- MAP = {
36
- 'quot' => 34,
37
- 'apos' => 39,
38
- 'amp' => 38,
39
- 'lt' => 60,
40
- 'gt' => 62,
41
- 'nbsp' => 160,
42
- 'iexcl' => 161,
43
- 'curren' => 164,
44
- 'cent' => 162,
45
- 'pound' => 163,
46
- 'yen' => 165,
47
- 'brvbar' => 166,
48
- 'sect' => 167,
49
- 'uml' => 168,
50
- 'copy' => 169,
51
- 'ordf' => 170,
52
- 'laquo' => 171,
53
- 'not' => 172,
54
- 'shy' => 173,
55
- 'reg' => 174,
56
- 'trade' => 8482,
57
- 'macr' => 175,
58
- 'deg' => 176,
59
- 'plusmn' => 177,
60
- 'sup2' => 178,
61
- 'sup3' => 179,
62
- 'acute' => 180,
63
- 'micro' => 181,
64
- 'para' => 182,
65
- 'middot' => 183,
66
- 'cedil' => 184,
67
- 'sup1' => 185,
68
- 'ordm' => 186,
69
- 'raquo' => 187,
70
- 'frac14' => 188,
71
- 'frac12' => 189,
72
- 'frac34' => 190,
73
- 'iquest' => 191,
74
- 'times' => 215,
75
- 'divide' => 247,
76
- 'Agrave' => 192,
77
- 'Aacute' => 193,
78
- 'Acirc' => 194,
79
- 'Atilde' => 195,
80
- 'Auml' => 196,
81
- 'Aring' => 197,
82
- 'AElig' => 198,
83
- 'Ccedil' => 199,
84
- 'Egrave' => 200,
85
- 'Eacute' => 201,
86
- 'Ecirc' => 202,
87
- 'Euml' => 203,
88
- 'Igrave' => 204,
89
- 'Iacute' => 205,
90
- 'Icirc' => 206,
91
- 'Iuml' => 207,
92
- 'ETH' => 208,
93
- 'Ntilde' => 209,
94
- 'Ograve' => 210,
95
- 'Oacute' => 211,
96
- 'Ocirc' => 212,
97
- 'Otilde' => 213,
98
- 'Ouml' => 214,
99
- 'Oslash' => 216,
100
- 'Ugrave' => 217,
101
- 'Uacute' => 218,
102
- 'Ucirc' => 219,
103
- 'Uuml' => 220,
104
- 'Yacute' => 221,
105
- 'THORN' => 222,
106
- 'szlig' => 223,
107
- 'agrave' => 224,
108
- 'aacute' => 225,
109
- 'acirc' => 226,
110
- 'atilde' => 227,
111
- 'auml' => 228,
112
- 'aring' => 229,
113
- 'aelig' => 230,
114
- 'ccedil' => 231,
115
- 'egrave' => 232,
116
- 'eacute' => 233,
117
- 'ecirc' => 234,
118
- 'euml' => 235,
119
- 'igrave' => 236,
120
- 'iacute' => 237,
121
- 'icirc' => 238,
122
- 'iuml' => 239,
123
- 'eth' => 240,
124
- 'ntilde' => 241,
125
- 'ograve' => 242,
126
- 'oacute' => 243,
127
- 'ocirc' => 244,
128
- 'otilde' => 245,
129
- 'ouml' => 246,
130
- 'oslash' => 248,
131
- 'ugrave' => 249,
132
- 'uacute' => 250,
133
- 'ucirc' => 251,
134
- 'uuml' => 252,
135
- 'yacute' => 253,
136
- 'thorn' => 254,
137
- 'yuml' => 255,
138
- 'OElig' => 338,
139
- 'oelig' => 339,
140
- 'Scaron' => 352,
141
- 'scaron' => 353,
142
- 'Yuml' => 376,
143
- 'circ' => 710,
144
- 'tilde' => 732,
145
- 'ensp' => 8194,
146
- 'emsp' => 8195,
147
- 'thinsp' => 8201,
148
- 'zwnj' => 8204,
149
- 'zwj' => 8205,
150
- 'lrm' => 8206,
151
- 'rlm' => 8207,
152
- 'ndash' => 8211,
153
- 'mdash' => 8212,
154
- 'lsquo' => 8216,
155
- 'rsquo' => 8217,
156
- 'sbquo' => 8218,
157
- 'ldquo' => 8220,
158
- 'rdquo' => 8221,
159
- 'bdquo' => 8222,
160
- 'dagger' => 8224,
161
- 'Dagger' => 8225,
162
- 'hellip' => 8230,
163
- 'permil' => 8240,
164
- 'lsaquo' => 8249,
165
- 'rsaquo' => 8250,
166
- 'euro' => 8364
167
- }
168
-
169
- MIN_LENGTH = MAP.keys.map{ |a| a.length }.min
170
- MAX_LENGTH = MAP.keys.map{ |a| a.length }.max
171
-
172
- # Precompile the regexp
173
- NAMED_ENTITY_REGEXP =
174
- /&([a-z]{#{HTMLEntities::MIN_LENGTH},#{HTMLEntities::MAX_LENGTH}});/i
175
-
176
- # Reverse map for converting characters to named entities
177
- REVERSE_MAP = MAP.invert
178
-
179
- BASIC_ENTITY_REGEXP = /[<>'"&]/
180
-
181
- UTF8_NON_ASCII_REGEXP = /[\x00-\x1f]|[\xc0-\xfd][\x80-\xbf]+/
182
-
183
- end
184
-
185
- class String
186
-
187
- # Because there's no need to make the user worry about the order here,
188
- # let's handle it.
189
- ENCODE_ENTITIES_COMMAND_ORDER = {
190
- :basic => 0,
191
- :named => 1,
192
- :decimal => 2,
193
- :hexadecimal => 3
194
- }
195
-
196
- #
197
- # Decode XML and HTML 4.01 entities in a string into their UTF-8
198
- # equivalents. Obviously, if your string is not already in UTF-8, you'd
199
- # better convert it before using this method, or the output will be mixed
200
- # up.
201
- # Unknown named entities are not converted
202
- #
203
- def decode_entities
204
- return gsub(HTMLEntities::NAMED_ENTITY_REGEXP) {
205
- HTMLEntities::MAP.has_key?($1) ? [HTMLEntities::MAP[$1]].pack('U') : $&
206
- }.gsub(/&#([0-9]{1,7});/) {
207
- [$1.to_i].pack('U')
208
- }.gsub(/&#x([0-9a-f]{1,6});/i) {
209
- [$1.to_i(16)].pack('U')
210
- }
211
- end
212
-
213
- #
214
- # Encode codepoints into their corresponding entities. Various operations
215
- # are possible, and may be specified in order:
216
- #
217
- # :basic :: Convert the five XML entities ('"<>&)
218
- # :named :: Convert non-ASCII characters to their named HTML 4.01 equivalent
219
- # :decimal :: Convert non-ASCII characters to decimal entities (e.g. &#1234;)
220
- # :hexadecimal :: Convert non-ASCII characters to hexadecimal entities (e.g. # &#x12ab;)
221
- #
222
- # You can specify the commands in any order, but they will be executed in
223
- # the order listed above to ensure that entity ampersands are not
224
- # clobbered and that named entities are replaced before numeric ones.
225
- #
226
- # If no instructions are specified, :basic will be used.
227
- #
228
- # Examples:
229
- # str.encode_entities - XML-safe
230
- # str.encode_entities(:basic, :decimal) - XML-safe and 7-bit clean
231
- # str.encode_entities(:basic, :named, :decimal) - 7-bit clean, with all
232
- # non-ASCII characters replaced with their named entity where possible, and
233
- # decimal equivalents otherwise.
234
- #
235
- # Note: It is the program's responsibility to ensure that the string
236
- # contains valid UTF-8 before calling this method.
237
- #
238
- def encode_entities(*instructions)
239
- str = nil
240
- if (instructions.empty?)
241
- instructions = [:basic]
242
- else
243
- instructions.each do |instr|
244
- unless ENCODE_ENTITIES_COMMAND_ORDER[instr]
245
- raise RuntimeError, "unknown encode_entities command `#{instr.inspect}'"
246
- end
247
- end
248
- instructions.sort! { |a,b|
249
- ENCODE_ENTITIES_COMMAND_ORDER[a] <=>
250
- ENCODE_ENTITIES_COMMAND_ORDER[b]
251
- }
252
- end
253
- instructions.each do |instruction|
254
- case instruction
255
- when :basic
256
- # Handled as basic ASCII
257
- str = (str || self).gsub(HTMLEntities::BASIC_ENTITY_REGEXP) {
258
- # It's safe to use the simpler [0] here because we know
259
- # that the basic entities are ASCII.
260
- '&' << HTMLEntities::REVERSE_MAP[$&[0]] << ';'
261
- }
262
- when :named
263
- # Test everything except printable ASCII
264
- str = (str || self).gsub(HTMLEntities::UTF8_NON_ASCII_REGEXP) {
265
- cp = $&.unpack('U')[0]
266
- (e = HTMLEntities::REVERSE_MAP[cp]) ? "&#{e};" : $&
267
- }
268
- when :decimal
269
- str = (str || self).gsub(HTMLEntities::UTF8_NON_ASCII_REGEXP) {
270
- "&##{$&.unpack('U')[0]};"
271
- }
272
- when :hexadecimal
273
- str = (str || self).gsub(HTMLEntities::UTF8_NON_ASCII_REGEXP) {
274
- "&#x#{$&.unpack('U')[0].to_s(16)};"
275
- }
276
- end
277
- end
278
- return str
279
- end
280
-
281
- end