sphinxcrawl 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +7 -0
- data/Gemfile +17 -0
- data/Gemfile.lock +89 -0
- data/Guardfile +16 -0
- data/LICENSE.txt +22 -0
- data/README.markdown +69 -0
- data/README.rdoc +13 -0
- data/Rakefile +66 -0
- data/bin/sphinxcrawl +35 -0
- data/example.conf +14 -0
- data/features/sphinxcrawl.feature +70 -0
- data/features/step_definitions/sphinxcrawl_steps.rb +1 -0
- data/features/support/env.rb +16 -0
- data/lib/sphinxcrawl.rb +9 -0
- data/lib/sphinxcrawl/crawler.rb +36 -0
- data/lib/sphinxcrawl/file_crawler.rb +20 -0
- data/lib/sphinxcrawl/page.rb +58 -0
- data/lib/sphinxcrawl/stream.rb +85 -0
- data/lib/sphinxcrawl/version.rb +3 -0
- data/lib/sphinxcrawl/web_crawler.rb +43 -0
- data/spec/fixtures/child_file.html +11 -0
- data/spec/fixtures/child_web.html +11 -0
- data/spec/fixtures/index_file.html +12 -0
- data/spec/fixtures/index_web.html +11 -0
- data/spec/fixtures/tree_file.html +12 -0
- data/spec/fixtures/tree_web.html +12 -0
- data/spec/spec_helper.rb +2 -0
- data/spec/sphinxcrawl/file_crawler_spec.rb +23 -0
- data/spec/sphinxcrawl/page_spec.rb +23 -0
- data/spec/sphinxcrawl/stream_spec.rb +20 -0
- data/spec/sphinxcrawl/web_crawler_spec.rb +32 -0
- data/sphinxcrawl.gemspec +26 -0
- metadata +194 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
@@ -0,0 +1,17 @@
+source 'https://rubygems.org'
+
+# Specify your gem's dependencies in sphinxcrawl.gemspec
+gemspec
+
+gem 'guard'
+gem 'guard-rspec'
+gem 'guard-ctags-bundler'
+gem 'guard-cucumber'
+
+group :darwin do
+  gem 'rb-fsevent'
+end
+
+group :linux do
+  gem 'rb-inotify'
+end
data/Gemfile.lock
ADDED
@@ -0,0 +1,89 @@
+PATH
+  remote: .
+  specs:
+    sphinxcrawl (0.0.1)
+      methadone (~> 1.2.2)
+      nokogiri (~> 1.5.5)
+
+GEM
+  remote: https://rubygems.org/
+  specs:
+    addressable (2.3.2)
+    aruba (0.5.0)
+      childprocess (= 0.2.3)
+      cucumber (>= 1.1.1)
+      ffi (>= 1.0.11)
+      rspec-expectations (>= 2.7.0)
+    builder (3.1.4)
+    childprocess (0.2.3)
+      ffi (~> 1.0.6)
+    coderay (1.0.8)
+    crack (0.3.1)
+    cucumber (1.2.1)
+      builder (>= 2.1.2)
+      diff-lcs (>= 1.1.3)
+      gherkin (~> 2.11.0)
+      json (>= 1.4.6)
+    diff-lcs (1.1.3)
+    ffi (1.0.11)
+    gherkin (2.11.5)
+      json (>= 1.4.6)
+    guard (1.5.3)
+      listen (>= 0.4.2)
+      lumberjack (>= 1.0.2)
+      pry (>= 0.9.10)
+      thor (>= 0.14.6)
+    guard-ctags-bundler (0.1.6)
+      guard (>= 1.1)
+    guard-cucumber (1.2.2)
+      cucumber (>= 1.2.0)
+      guard (>= 1.1.0)
+    guard-rspec (2.1.1)
+      guard (>= 1.1)
+      rspec (~> 2.11)
+    json (1.7.5)
+    listen (0.5.3)
+    lumberjack (1.0.2)
+    methadone (1.2.2)
+      bundler
+    method_source (0.8.1)
+    nokogiri (1.5.5)
+    pry (0.9.10)
+      coderay (~> 1.0.5)
+      method_source (~> 0.8)
+      slop (~> 3.3.1)
+    rake (0.9.2.2)
+    rb-fsevent (0.9.2)
+    rb-inotify (0.8.8)
+      ffi (>= 0.5.0)
+    rdoc (3.12)
+      json (~> 1.4)
+    rspec (2.11.0)
+      rspec-core (~> 2.11.0)
+      rspec-expectations (~> 2.11.0)
+      rspec-mocks (~> 2.11.0)
+    rspec-core (2.11.1)
+    rspec-expectations (2.11.3)
+      diff-lcs (~> 1.1.3)
+    rspec-mocks (2.11.3)
+    slop (3.3.3)
+    thor (0.16.0)
+    webmock (1.8.10)
+      addressable (>= 2.2.7)
+      crack (>= 0.1.7)
+
+PLATFORMS
+  ruby
+
+DEPENDENCIES
+  aruba
+  guard
+  guard-ctags-bundler
+  guard-cucumber
+  guard-rspec
+  rake
+  rb-fsevent
+  rb-inotify
+  rdoc
+  sphinxcrawl!
+  webmock
data/Guardfile
ADDED
@@ -0,0 +1,16 @@
+guard 'rspec' do
+  watch(%r{^spec/.+_spec\.rb$})
+  watch(%r{^lib/(.+)\.rb$}) { |m| "spec/lib/#{m[1]}_spec.rb" }
+  watch('spec/spec_helper.rb') { 'spec' }
+end
+
+guard 'ctags-bundler', :src_path => ['lib', 'spec'] do
+  watch(/^(lib)\/.*\.rb$/)
+  watch('Gemfile.lock')
+end
+
+guard 'cucumber' do
+  watch(%r{^features/.+\.feature$})
+  watch(%r{^features/support/.+$}) { 'features' }
+  watch(%r{^features/step_definitions/(.+)_steps\.rb$}) { |m| Dir[File.join("**/#{m[1]}.feature")][0] || 'features' }
+end
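With this Guardfile in place, Guard re-runs the matching specs, regenerates ctags, and re-runs features whenever a watched file changes. The standard way to start it (not shown in this diff) is:

    $ bundle exec guard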
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
+Copyright (c) 2012 Jon Doveston
+
+MIT License
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.markdown
ADDED
@@ -0,0 +1,69 @@
+# Sphinxcrawl
+
+A simple command-line gem that crawls a website and generates an XML stream for a
+Sphinx search index.
+
+## Installation
+
+Install it with:
+
+    $ gem install sphinxcrawl
+
+## Usage
+
+After installation the `sphinxcrawl` command should be available.
+
+Create a sphinx.conf file, for example:
+
+
+    source page
+    {
+      type = xmlpipe2
+      xmlpipe_command = sphinxcrawl http://www.example.com -d 2 2>/dev/null
+    }
+
+    index page
+    {
+      source = page
+      path = sphinx/page
+      morphology = stem_en
+      charset_type = utf-8
+      html_strip = 1
+    }
+
+Install Sphinx; on Debian-like Linuxes this is:
+
+    sudo apt-get install sphinxsearch
+
+You should now have the indexer and search commands:
+
+    indexer -c sphinx.conf page
+
+and you can search the index with:
+
+    search -c sphinx.conf name
+
+## Requirements
+
+This gem is not Google! It uses nokogiri to parse HTML, so it will fail on badly
+formed HTML. More importantly, it only indexes the parts of a page marked with
+a specific HTML attribute, which means you can only index sites that you control.
+For example,
+
+    ...
+    <div data-field="description">
+      <p>This is a description</p>
+    </div>
+    ...
+
+will index the text inside the div tag (stripping other tags) and put the text
+in a Sphinx field called `description`. Fields are aggregated across all the
+pages visited in a crawl.
+
+## Contributing
+
+1. Fork it
+2. Create your feature branch (`git checkout -b my-new-feature`)
+3. Commit your changes (`git commit -am 'Add some feature'`)
+4. Push to the branch (`git push origin my-new-feature`)
+5. Create a new Pull Request
data/README.rdoc
ADDED
@@ -0,0 +1,13 @@
+= sphinxcrawl - html crawler and sphinx xmlstream generator
+
+Author::    Jon Doveston (jon@doveston.me.uk)
+Copyright:: Copyright (c) 2012 Jon Doveston
+
+
+A command line gem that can crawl html (files or http) and generate an xml
+stream for sphinx search.
+
+== Links
+
+* {Source on Github}[https://github.com/hatoishi/sphinxcrawl]
+
data/Rakefile
ADDED
@@ -0,0 +1,66 @@
+def dump_load_path
+  puts $LOAD_PATH.join("\n")
+  found = nil
+  $LOAD_PATH.each do |path|
+    if File.exists?(File.join(path,"rspec"))
+      puts "Found rspec in #{path}"
+      if File.exists?(File.join(path,"rspec","core"))
+        puts "Found core"
+        if File.exists?(File.join(path,"rspec","core","rake_task"))
+          puts "Found rake_task"
+          found = path
+        else
+          puts "!! no rake_task"
+        end
+      else
+        puts "!!! no core"
+      end
+    end
+  end
+  if found.nil?
+    puts "Didn't find rspec/core/rake_task anywhere"
+  else
+    puts "Found in #{found}"
+  end
+end
+require 'bundler'
+require 'rake/clean'
+
+begin
+  require 'rspec/core/rake_task'
+rescue LoadError
+  dump_load_path
+  raise
+end
+
+require 'cucumber'
+require 'cucumber/rake/task'
+gem 'rdoc' # we need the installed RDoc gem, not the system one
+require 'rdoc/task'
+
+include Rake::DSL
+
+Bundler::GemHelper.install_tasks
+
+
+RSpec::Core::RakeTask.new do |t|
+  # Put spec opts in a file named .rspec in root
+end
+
+
+CUKE_RESULTS = 'results.html'
+CLEAN << CUKE_RESULTS
+Cucumber::Rake::Task.new(:features) do |t|
+  t.cucumber_opts = "features --format html -o #{CUKE_RESULTS} --format pretty --no-source -x"
+  t.fork = false
+end
+
+Rake::RDocTask.new do |rd|
+
+  rd.main = "README.rdoc"
+
+  rd.rdoc_files.include("README.rdoc","lib/**/*.rb","bin/**/*")
+end
+
+task :default => [:spec,:features]
+
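Since the Rakefile wires :default to [:spec, :features], the whole test suite runs with a bare rake invocation:

    $ bundle exec rake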
data/bin/sphinxcrawl
ADDED
@@ -0,0 +1,35 @@
+#!/usr/bin/env ruby
+
+require 'optparse'
+require 'methadone'
+require 'sphinxcrawl'
+
+class App
+  include Methadone::Main
+  include Methadone::CLILogging
+
+  main do |index|
+    index ||= 'index.html'
+    if index =~ /http:/
+      crawler = Sphinxcrawl::WebCrawler.new(index, options[:depth].to_i)
+    else
+      exit_now!(1, "file #{index} not found") unless File.exists?(index)
+      index = index+'/index.html' if File.directory?(index)
+      exit_now!(1, "file #{index} not found") unless File.exists?(index)
+      crawler = Sphinxcrawl::FileCrawler.new(index, options[:depth].to_i)
+    end
+    pages = crawler.pages
+    stream = Sphinxcrawl::Stream.new(pages)
+    puts stream.to_xml
+  end
+
+  description 'Generate sphinx xml stream from html pages'
+  arg :index, :optional
+
+  options['depth'] = '0'
+  on('-d depth', '--depth', 'Crawl depth', /^\d+$/)
+
+  version Sphinxcrawl::VERSION
+  use_log_level_option
+  go!
+end
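The executable picks WebCrawler or FileCrawler by matching the argument against /http:/, and writes the resulting XML stream to stdout, which is exactly what Sphinx's xmlpipe2 source expects. Illustrative invocations (the output paths and the local site directory are hypothetical):

    $ sphinxcrawl http://www.example.com -d 2 > stream.xml
    $ sphinxcrawl site/index.html -d 1 > stream.xml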
data/example.conf
ADDED
@@ -0,0 +1,14 @@
+source page
+{
+  type = xmlpipe2
+  xmlpipe_command = bundle exec ./bin/sphinxcrawl spec/fixtures/index_file.html -d 2 2>/dev/null
+}
+
+index page
+{
+  source = page
+  path = tmp/page
+  morphology = stem_en
+  charset_type = utf-8
+  html_strip = 1
+}
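Assuming a tmp/ directory exists for the index files, this config can be exercised straight from a checkout with Sphinx's indexer, as in the README:

    $ indexer -c example.conf page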
data/features/sphinxcrawl.feature
ADDED
@@ -0,0 +1,70 @@
+Feature: Generation of sphinx xml
+  Scenario: help page
+    When I get help for "sphinxcrawl"
+    Then the exit status should be 0
+    And the banner should be present
+    And the banner should document that this app takes options
+    And the following options should be documented:
+      |--version|
+
+  Scenario: xml for a nonexistent file
+    When I run `sphinxcrawl spec/fixtures/blah.html`
+    Then the exit status should be 1
+
+  Scenario: xml for a nonexistent index
+    When I run `sphinxcrawl spec`
+    Then the exit status should be 1
+
+  Scenario: xml for a single file
+    Given a file named "test.html" with:
+    """
+    <html>
+      <head>
+      </head>
+      <body>
+        <h1 data-field="name">My name</h1>
+        <p data-field="description">My description</p>
+        <p data-field="body"><span>My</span><span>body</span></p>
+      </body>
+    </html>
+    """
+    When I run `sphinxcrawl test.html`
+    Then the exit status should be 0
+    And the output should contain "name"
+    And the output should contain "description"
+    And the output should contain "body"
+
+  Scenario: index the xml
+    Given a file named "test.html" with:
+    """
+    <html>
+      <head>
+      </head>
+      <body>
+        <h1 data-field="name">My name</h1>
+        <p data-field="description">My description</p>
+        <p data-field="body"><span>My</span><span>body</span></p>
+      </body>
+    </html>
+    """
+    And a file named "test.conf" with:
+    """
+    indexer
+    {
+    }
+    source page
+    {
+      type = xmlpipe2
+      xmlpipe_command = ../../bin/sphinxcrawl test.html 2>/dev/null
+    }
+    index page
+    {
+      source = page
+      path = page
+      morphology = stem_en
+      charset_type = utf-8
+      html_strip = 1
+    }
+    """
+    When I run `indexer -c test.conf page`
+    Then the exit status should be 0
data/features/step_definitions/sphinxcrawl_steps.rb
ADDED
@@ -0,0 +1 @@
+# Put your step definitions here
data/features/support/env.rb
ADDED
@@ -0,0 +1,16 @@
+require 'aruba/cucumber'
+require 'methadone/cucumber'
+
+ENV['PATH'] = "#{File.expand_path(File.dirname(__FILE__) + '/../../bin')}#{File::PATH_SEPARATOR}#{ENV['PATH']}"
+LIB_DIR = File.join(File.expand_path(File.dirname(__FILE__)),'..','..','lib')
+
+Before do
+  # Using "announce" causes massive warnings on 1.9.2
+  @puts = true
+  @original_rubylib = ENV['RUBYLIB']
+  ENV['RUBYLIB'] = LIB_DIR + File::PATH_SEPARATOR + ENV['RUBYLIB'].to_s
+end
+
+After do
+  ENV['RUBYLIB'] = @original_rubylib
+end
data/lib/sphinxcrawl.rb
ADDED
data/lib/sphinxcrawl/crawler.rb
ADDED
@@ -0,0 +1,36 @@
+require 'set'
+
+module Sphinxcrawl
+  class Crawler
+    attr_reader :depth
+
+    def initialize(depth=0)
+      @depth = depth
+    end
+
+    def pages
+      return @pages if @pages
+      return [] unless index
+      @pages = Set.new([index])
+      return @pages if @depth == 0
+
+      current_pages = Set.new([index])
+      depth.times do
+        links = current_pages.map(&:links).flatten.compact.uniq
+        current_pages = Set.new(links.map{ |url| get_page(url) }.compact) - @pages
+        @pages += current_pages
+      end
+      @pages
+    end
+
+    private
+
+    def index
+      nil
+    end
+
+    def get_page(url)
+      nil
+    end
+  end
+end
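Crawler is the abstract breadth-first core of the gem: pages starts from index, expands the link frontier depth times, and relies on Set together with Page#eql?/#hash to avoid revisiting URLs. Subclasses only supply the two private hooks, as FileCrawler and WebCrawler do below. A minimal sketch of a custom subclass (hypothetical, not part of the gem) crawling an in-memory url => html hash:

    # Hypothetical illustration only: a crawler over an in-memory hash.
    class HashCrawler < Sphinxcrawl::Crawler
      def initialize(html_by_url, depth=0)
        @html_by_url = html_by_url
        super(depth)
      end

      private

      # The starting Page of the crawl.
      def index
        @index ||= get_page('index.html')
      end

      # Resolve a link to a Page, or nil to skip it.
      def get_page(url)
        html = @html_by_url[url]
        Sphinxcrawl::Page.new(url, html) if html
      end
    end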
data/lib/sphinxcrawl/file_crawler.rb
ADDED
@@ -0,0 +1,20 @@
+module Sphinxcrawl
+  class FileCrawler < Crawler
+    def initialize(index_file_name, depth=0)
+      @directory = File.dirname(index_file_name)
+      @basename = File.basename(index_file_name)
+      super(depth)
+    end
+
+    private
+
+    def index
+      @index ||= get_page(@basename)
+    end
+
+    def get_page(url)
+      html = File.read(@directory + '/' + url) rescue nil
+      Page.new(url, html) if html
+    end
+  end
+end
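FileCrawler resolves every link relative to the index file's directory, and the rescue nil means unreadable or missing files are simply skipped rather than aborting the crawl. Usage matching the specs in this package:

    crawler = Sphinxcrawl::FileCrawler.new('spec/fixtures/index_file.html', 2)
    crawler.pages.length  # => 3 (the index, tree and child fixtures)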
data/lib/sphinxcrawl/page.rb
ADDED
@@ -0,0 +1,58 @@
+require 'nokogiri'
+
+module Sphinxcrawl
+  class Page
+    attr_reader :html, :url
+
+    def initialize(url, html)
+      @url = url
+      @html = html
+      @field_data = {}
+    end
+
+    def empty?
+      !html || html.empty?
+    end
+
+    def field_names
+      fields.map do |n|
+        n.attribute('data-field').value
+      end.uniq
+    end
+
+    def field(name)
+      @field_data[name] ||= get_field(name)
+    end
+
+    def links
+      @links ||= document.xpath('//a/@href').map(&:value).map do |url|
+        # only local links (no http://)
+        uri = URI.parse(url)
+        uri.host.nil? && uri.scheme.nil? ? url : nil
+      end.compact
+    end
+
+    def eql?(compare)
+      url == compare.url
+    end
+    alias :== :eql?
+
+    def hash
+      url.hash
+    end
+
+    private
+
+    def get_field(name)
+      document.xpath("//*[@data-field='#{name}']//text()").map(&:content).join(' ')
+    end
+
+    def fields
+      @fields ||= document.xpath('//*[@data-field]')
+    end
+
+    def document
+      @document ||= Nokogiri::HTML(html)
+    end
+  end
+end
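Page parses its HTML lazily with Nokogiri; field_names and field read the data-field regions, while links keeps only site-local hrefs (no scheme or host). Against the index fixture shipped in this package:

    html = File.read('spec/fixtures/index_file.html')
    page = Sphinxcrawl::Page.new('index_file.html', html)
    page.field_names    # => ["name", "description", "body"]
    page.field('name')  # => "My name"
    page.links          # => ["tree_file.html", "index_file.html"]  (the google link is dropped)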
data/lib/sphinxcrawl/stream.rb
ADDED
@@ -0,0 +1,85 @@
+require 'rexml/document'
+
+module Sphinxcrawl
+  class Stream
+    include REXML
+
+    attr_reader :pages
+
+    def initialize(pages)
+      @pages = Array(pages)
+      @count = 0
+    end
+
+    def empty?
+      pages.length == 0
+    end
+
+    def number_of_pages
+      pages.length
+    end
+
+    def field_names
+      @field_names ||= pages.map(&:field_names).flatten.uniq
+    end
+
+    def to_xml
+      add_fields
+      add_documents
+      ''.tap { |xml| document.write(xml) }
+    end
+
+    private
+
+    def next_id
+      @count += 1
+    end
+
+    def add_fields
+      field_names.each do |name|
+        add_field(name)
+      end
+    end
+
+    def add_documents
+      pages.each do |page|
+        add_document(page)
+      end
+    end
+
+    def add_field(name)
+      schema.add_element('sphinx:field').tap do |field|
+        field.add_attribute('name', name)
+      end
+    end
+
+    def add_document(page)
+      if page.field_names.length > 0
+        root.add_element('sphinx:document').tap do |doc|
+          doc.add_attribute('id', next_id)
+          doc.add_element('url').text = CData.new(page.url)
+          field_names.each do |name|
+            doc.add_element(name).text = CData.new(page.field(name))
+          end
+        end
+      end
+    end
+
+    def document
+      @document ||= Document.new.tap do |doc|
+        doc << XMLDecl.new('1.0', 'UTF-8')
+        @docset = doc.add_element('sphinx:docset')
+      end
+    end
+
+    def root
+      document.root
+    end
+
+    def schema
+      @schema ||= root.add_element('sphinx:schema').tap do |sch|
+        sch.add_element('sphinx:attr').add_attributes('name' => 'url', 'type' => 'string')
+      end
+    end
+  end
+end
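Stream assembles the sphinx:docset document that xmlpipe2 consumes: one sphinx:field per field name in the schema, then one sphinx:document per page with the field text as CDATA. For a single fixture page the output is roughly of this shape (indentation added here; REXML writes it unindented):

    <?xml version='1.0' encoding='UTF-8'?>
    <sphinx:docset>
      <sphinx:schema>
        <sphinx:attr name='url' type='string'/>
        <sphinx:field name='name'/>
        <sphinx:field name='description'/>
        <sphinx:field name='body'/>
      </sphinx:schema>
      <sphinx:document id='1'>
        <url><![CDATA[index_file.html]]></url>
        <name><![CDATA[My name]]></name>
        ...
      </sphinx:document>
    </sphinx:docset>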
data/lib/sphinxcrawl/web_crawler.rb
ADDED
@@ -0,0 +1,43 @@
+require 'uri'
+require 'net/http'
+
+module Sphinxcrawl
+  class WebCrawler < Crawler
+    def initialize(url, depth=0)
+      @uri = URI.parse(url)
+      super(depth)
+    end
+
+    private
+
+    def host
+      @uri.host
+    end
+
+    def port
+      @uri.port
+    end
+
+    def index_path
+      @uri.path
+    end
+
+    def http
+      Net::HTTP.new(host, port)
+    end
+
+    def index
+      @index ||= get_page(index_path)
+    end
+
+    def get_page(path)
+      path = path && !path.empty? ? path : '/'
+      location = path
+      begin
+        response = http.request(Net::HTTP::Get.new(location))
+      end while location = response.header['location']
+      html = response.body
+      Page.new(path, html) if html
+    end
+  end
+end
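WebCrawler fetches pages over a single host and port taken from the index URL; the begin/end while loop in get_page re-issues the request against whatever Location header comes back, so redirects are followed (with no guard against redirect loops). Usage mirroring the executable:

    crawler = Sphinxcrawl::WebCrawler.new('http://www.example.com/index_web.html', 1)
    puts Sphinxcrawl::Stream.new(crawler.pages).to_xml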
data/spec/fixtures/child_file.html
ADDED
@@ -0,0 +1,11 @@
+<html>
+  <head>
+  </head>
+  <body>
+    <h1 data-field="name">My name is child</h1>
+    <p data-field="description">My description is child</p>
+    <p data-field="body"><span>My</span><span>body</span>is child</p>
+    <a href="index_file.html">index</a>
+    <a href="tree_file.html">tree</a>
+  </body>
+</html>
data/spec/fixtures/child_web.html
ADDED
@@ -0,0 +1,11 @@
+<html>
+  <head>
+  </head>
+  <body>
+    <h1 data-field="name">My name is child</h1>
+    <p data-field="description">My description is child</p>
+    <p data-field="body"><span>My</span><span>body</span>is child</p>
+    <a href="/index_web.html">index</a>
+    <a href="/tree_web.html">tree</a>
+  </body>
+</html>
data/spec/fixtures/index_file.html
ADDED
@@ -0,0 +1,12 @@
+<html>
+  <head>
+  </head>
+  <body>
+    <h1 data-field="name">My name</h1>
+    <p data-field="description">My description</p>
+    <p data-field="body"><span>My</span><span>body</span></p>
+    <a href="tree_file.html">tree</a>
+    <a href="index_file.html">index</a>
+    <a href="http://www.google.com">google</a>
+  </body>
+</html>
data/spec/fixtures/index_web.html
ADDED
@@ -0,0 +1,11 @@
+<html>
+  <head>
+  </head>
+  <body>
+    <h1 data-field="name">My name</h1>
+    <p data-field="description">My description</p>
+    <p data-field="body"><span>My</span><span>body</span></p>
+    <a href="/tree_web.html">tree</a>
+    <a href="/index_web.html">index</a>
+  </body>
+</html>
data/spec/fixtures/tree_file.html
ADDED
@@ -0,0 +1,12 @@
+<html>
+  <head>
+  </head>
+  <body>
+    <h1 data-field="name">My name is tree</h1>
+    <p data-field="description">My description is tree</p>
+    <p data-field="body"><span>My</span><span>body</span>is tree</p>
+    <a href="index_file.html">index</a>
+    <a href="child_file.html">child</a>
+    <a href="tree_file.html">tree</a>
+  </body>
+</html>
data/spec/fixtures/tree_web.html
ADDED
@@ -0,0 +1,12 @@
+<html>
+  <head>
+  </head>
+  <body>
+    <h1 data-field="name">My name is tree</h1>
+    <p data-field="description">My description is tree</p>
+    <p data-field="body"><span>My</span><span>body</span>is tree</p>
+    <a href="/index_web.html">index</a>
+    <a href="/child_web.html">child</a>
+    <a href="/tree_web.html">tree</a>
+  </body>
+</html>
data/spec/spec_helper.rb
ADDED
data/spec/sphinxcrawl/file_crawler_spec.rb
ADDED
@@ -0,0 +1,23 @@
+require 'spec_helper'
+
+describe Sphinxcrawl::FileCrawler do
+  let(:dir) { 'spec/fixtures' }
+  let(:index) { 'index_file.html' }
+  let(:html) { dir + '/' + index }
+  subject { Sphinxcrawl::FileCrawler.new(html) }
+
+  its(:depth) { should be == 0 }
+  specify { subject.pages.length.should be == 1 }
+
+  context "with a depth of 1" do
+    subject { Sphinxcrawl::FileCrawler.new(html, 1) }
+    its(:depth) { should be == 1 }
+    specify { subject.pages.length.should be == 2 }
+  end
+
+  context "with a depth of 2" do
+    subject { Sphinxcrawl::FileCrawler.new(html, 2) }
+    its(:depth) { should be == 2 }
+    specify { subject.pages.length.should be == 3 }
+  end
+end
data/spec/sphinxcrawl/page_spec.rb
ADDED
@@ -0,0 +1,23 @@
+require 'spec_helper'
+
+describe Sphinxcrawl::Page do
+  let(:html) { File.read('spec/fixtures/index_file.html') }
+  let(:same) { Sphinxcrawl::Page.new(subject.url, 'blah') }
+  subject { Sphinxcrawl::Page.new('index_file.html', html) }
+
+  its(:html) { should be == html }
+  it { should_not be_empty }
+
+  its(:field_names) { should be == %w[name description body] }
+  specify { subject.field('name').should be == 'My name' }
+  specify { subject.field('description').should be == 'My description' }
+  specify { subject.field('body').should be == 'My body' }
+
+  its(:links) { should be == ['tree_file.html', 'index_file.html'] }
+
+  specify { subject.should be == same }
+  specify { subject.should eq(same) }
+  specify { subject.should eql(same) }
+  specify { subject.should_not equal same }
+  specify { [subject, same].uniq.length.should be == 1 }
+end
data/spec/sphinxcrawl/stream_spec.rb
ADDED
@@ -0,0 +1,20 @@
+require 'spec_helper'
+
+describe Sphinxcrawl::Stream do
+  let(:html) { File.read('spec/fixtures/index_file.html') }
+  let(:page) { Sphinxcrawl::Page.new('index_file.html', html) }
+  subject { Sphinxcrawl::Stream.new(page) }
+  let(:xml) { subject.to_xml }
+
+  it { should_not be_empty }
+  its(:number_of_pages) { should be == 1 }
+
+  its(:field_names) { should be == %w[name description body] }
+
+  its(:to_xml) { should include('name') }
+  its(:to_xml) { should include('description') }
+  its(:to_xml) { should include('body') }
+  its(:to_xml) { should include('My name') }
+  its(:to_xml) { should include('My description') }
+  its(:to_xml) { should include('My body') }
+end
data/spec/sphinxcrawl/web_crawler_spec.rb
ADDED
@@ -0,0 +1,32 @@
+require 'spec_helper'
+
+describe Sphinxcrawl::WebCrawler do
+  let(:dir) { 'spec/fixtures' }
+  let(:domain) { 'http://www.example.com' }
+  let(:index) { 'index_web.html' }
+  let(:tree) { 'tree_web.html' }
+  let(:child) { 'child_web.html' }
+
+  before do
+    stub_request(:get, domain+'/'+index).to_return(:body => File.new(dir+'/'+index), :status => 200)
+    stub_request(:get, domain+'/'+tree).to_return(:body => File.new(dir+'/'+tree), :status => 200)
+    stub_request(:get, domain+'/'+child).to_return(:body => File.new(dir+'/'+child), :status => 200)
+  end
+
+  subject { Sphinxcrawl::WebCrawler.new(domain+'/'+index) }
+
+  its(:depth) { should be == 0 }
+  specify { subject.pages.length.should be == 1 }
+
+  context "with a depth of 1" do
+    subject { Sphinxcrawl::WebCrawler.new(domain+'/'+index, 1) }
+    its(:depth) { should be == 1 }
+    specify { subject.pages.length.should be == 2 }
+  end
+
+  context "with a depth of 2" do
+    subject { Sphinxcrawl::WebCrawler.new(domain+'/'+index, 2) }
+    its(:depth) { should be == 2 }
+    specify { subject.pages.length.should be == 3 }
+  end
+end
data/sphinxcrawl.gemspec
ADDED
@@ -0,0 +1,26 @@
+# -*- encoding: utf-8 -*-
+lib = File.expand_path('../lib', __FILE__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+require 'sphinxcrawl/version'
+
+Gem::Specification.new do |gem|
+  gem.name = 'sphinxcrawl'
+  gem.version = Sphinxcrawl::VERSION
+  gem.authors = ['Jon Doveston']
+  gem.email = ['jon@doveston.me.uk']
+  gem.description = %q{Simple command to crawl a site and process html into a sphinx xmlstream}
+  gem.summary = %q{Simple command to crawl a site and process html into a sphinx xmlstream}
+  gem.homepage = 'https://github.com/hatoishi/sphinxcrawl'
+
+  gem.files = `git ls-files`.split($/)
+  gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
+  gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
+  gem.require_paths = ['lib']
+  gem.add_dependency('methadone', '~> 1.2.2')
+  gem.add_dependency('nokogiri', '~> 1.5.5')
+
+  gem.add_development_dependency('rdoc')
+  gem.add_development_dependency('aruba')
+  gem.add_development_dependency('rake')
+  gem.add_development_dependency('webmock')
+end
metadata
ADDED
@@ -0,0 +1,194 @@
+--- !ruby/object:Gem::Specification
+name: sphinxcrawl
+version: !ruby/object:Gem::Version
+  version: 0.0.1
+prerelease:
+platform: ruby
+authors:
+- Jon Doveston
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2012-11-03 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: methadone
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 1.2.2
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 1.2.2
+- !ruby/object:Gem::Dependency
+  name: nokogiri
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 1.5.5
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 1.5.5
+- !ruby/object:Gem::Dependency
+  name: rdoc
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: aruba
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: webmock
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+description: Simple command to crawl a site and process html into a sphinx xmlstream
+email:
+- jon@doveston.me.uk
+executables:
+- sphinxcrawl
+extensions: []
+extra_rdoc_files: []
+files:
+- .gitignore
+- Gemfile
+- Gemfile.lock
+- Guardfile
+- LICENSE.txt
+- README.markdown
+- README.rdoc
+- Rakefile
+- bin/sphinxcrawl
+- example.conf
+- features/sphinxcrawl.feature
+- features/step_definitions/sphinxcrawl_steps.rb
+- features/support/env.rb
+- lib/sphinxcrawl.rb
+- lib/sphinxcrawl/crawler.rb
+- lib/sphinxcrawl/file_crawler.rb
+- lib/sphinxcrawl/page.rb
+- lib/sphinxcrawl/stream.rb
+- lib/sphinxcrawl/version.rb
+- lib/sphinxcrawl/web_crawler.rb
+- spec/fixtures/child_file.html
+- spec/fixtures/child_web.html
+- spec/fixtures/index_file.html
+- spec/fixtures/index_web.html
+- spec/fixtures/tree_file.html
+- spec/fixtures/tree_web.html
+- spec/spec_helper.rb
+- spec/sphinxcrawl/file_crawler_spec.rb
+- spec/sphinxcrawl/page_spec.rb
+- spec/sphinxcrawl/stream_spec.rb
+- spec/sphinxcrawl/web_crawler_spec.rb
+- sphinxcrawl.gemspec
+homepage: https://github.com/hatoishi/sphinxcrawl
+licenses: []
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+      segments:
+      - 0
+      hash: 3810366116509016959
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+      segments:
+      - 0
+      hash: 3810366116509016959
+requirements: []
+rubyforge_project:
+rubygems_version: 1.8.24
+signing_key:
+specification_version: 3
+summary: Simple command to crawl a site and process html into a sphinx xmlstream
+test_files:
+- features/sphinxcrawl.feature
+- features/step_definitions/sphinxcrawl_steps.rb
+- features/support/env.rb
+- spec/fixtures/child_file.html
+- spec/fixtures/child_web.html
+- spec/fixtures/index_file.html
+- spec/fixtures/index_web.html
+- spec/fixtures/tree_file.html
+- spec/fixtures/tree_web.html
+- spec/spec_helper.rb
+- spec/sphinxcrawl/file_crawler_spec.rb
+- spec/sphinxcrawl/page_spec.rb
+- spec/sphinxcrawl/stream_spec.rb
+- spec/sphinxcrawl/web_crawler_spec.rb