sphinxcrawl 0.0.1
- data/.gitignore +7 -0
- data/Gemfile +17 -0
- data/Gemfile.lock +89 -0
- data/Guardfile +16 -0
- data/LICENSE.txt +22 -0
- data/README.markdown +69 -0
- data/README.rdoc +13 -0
- data/Rakefile +66 -0
- data/bin/sphinxcrawl +35 -0
- data/example.conf +14 -0
- data/features/sphinxcrawl.feature +70 -0
- data/features/step_definitions/sphinxcrawl_steps.rb +1 -0
- data/features/support/env.rb +16 -0
- data/lib/sphinxcrawl.rb +9 -0
- data/lib/sphinxcrawl/crawler.rb +36 -0
- data/lib/sphinxcrawl/file_crawler.rb +20 -0
- data/lib/sphinxcrawl/page.rb +58 -0
- data/lib/sphinxcrawl/stream.rb +85 -0
- data/lib/sphinxcrawl/version.rb +3 -0
- data/lib/sphinxcrawl/web_crawler.rb +43 -0
- data/spec/fixtures/child_file.html +11 -0
- data/spec/fixtures/child_web.html +11 -0
- data/spec/fixtures/index_file.html +12 -0
- data/spec/fixtures/index_web.html +11 -0
- data/spec/fixtures/tree_file.html +12 -0
- data/spec/fixtures/tree_web.html +12 -0
- data/spec/spec_helper.rb +2 -0
- data/spec/sphinxcrawl/file_crawler_spec.rb +23 -0
- data/spec/sphinxcrawl/page_spec.rb +23 -0
- data/spec/sphinxcrawl/stream_spec.rb +20 -0
- data/spec/sphinxcrawl/web_crawler_spec.rb +32 -0
- data/sphinxcrawl.gemspec +26 -0
- metadata +194 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
@@ -0,0 +1,17 @@
source 'https://rubygems.org'

# Specify your gem's dependencies in sphinxcrawl.gemspec
gemspec

gem 'guard'
gem 'guard-rspec'
gem 'guard-ctags-bundler'
gem 'guard-cucumber'

group :darwin do
  gem 'rb-fsevent'
end

group :linux do
  gem 'rb-inotify'
end
data/Gemfile.lock
ADDED
@@ -0,0 +1,89 @@
PATH
  remote: .
  specs:
    sphinxcrawl (0.0.1)
      methadone (~> 1.2.2)
      nokogiri (~> 1.5.5)

GEM
  remote: https://rubygems.org/
  specs:
    addressable (2.3.2)
    aruba (0.5.0)
      childprocess (= 0.2.3)
      cucumber (>= 1.1.1)
      ffi (>= 1.0.11)
      rspec-expectations (>= 2.7.0)
    builder (3.1.4)
    childprocess (0.2.3)
      ffi (~> 1.0.6)
    coderay (1.0.8)
    crack (0.3.1)
    cucumber (1.2.1)
      builder (>= 2.1.2)
      diff-lcs (>= 1.1.3)
      gherkin (~> 2.11.0)
      json (>= 1.4.6)
    diff-lcs (1.1.3)
    ffi (1.0.11)
    gherkin (2.11.5)
      json (>= 1.4.6)
    guard (1.5.3)
      listen (>= 0.4.2)
      lumberjack (>= 1.0.2)
      pry (>= 0.9.10)
      thor (>= 0.14.6)
    guard-ctags-bundler (0.1.6)
      guard (>= 1.1)
    guard-cucumber (1.2.2)
      cucumber (>= 1.2.0)
      guard (>= 1.1.0)
    guard-rspec (2.1.1)
      guard (>= 1.1)
      rspec (~> 2.11)
    json (1.7.5)
    listen (0.5.3)
    lumberjack (1.0.2)
    methadone (1.2.2)
      bundler
    method_source (0.8.1)
    nokogiri (1.5.5)
    pry (0.9.10)
      coderay (~> 1.0.5)
      method_source (~> 0.8)
      slop (~> 3.3.1)
    rake (0.9.2.2)
    rb-fsevent (0.9.2)
    rb-inotify (0.8.8)
      ffi (>= 0.5.0)
    rdoc (3.12)
      json (~> 1.4)
    rspec (2.11.0)
      rspec-core (~> 2.11.0)
      rspec-expectations (~> 2.11.0)
      rspec-mocks (~> 2.11.0)
    rspec-core (2.11.1)
    rspec-expectations (2.11.3)
      diff-lcs (~> 1.1.3)
    rspec-mocks (2.11.3)
    slop (3.3.3)
    thor (0.16.0)
    webmock (1.8.10)
      addressable (>= 2.2.7)
      crack (>= 0.1.7)

PLATFORMS
  ruby

DEPENDENCIES
  aruba
  guard
  guard-ctags-bundler
  guard-cucumber
  guard-rspec
  rake
  rb-fsevent
  rb-inotify
  rdoc
  sphinxcrawl!
  webmock
data/Guardfile
ADDED
@@ -0,0 +1,16 @@
guard 'rspec' do
  watch(%r{^spec/.+_spec\.rb$})
  watch(%r{^lib/(.+)\.rb$}) { |m| "spec/lib/#{m[1]}_spec.rb" }
  watch('spec/spec_helper.rb') { 'spec' }
end

guard 'ctags-bundler', :src_path => ['lib', 'spec'] do
  watch(/^(lib)\/.*\.rb$/)
  watch('Gemfile.lock')
end

guard 'cucumber' do
  watch(%r{^features/.+\.feature$})
  watch(%r{^features/support/.+$}) { 'features' }
  watch(%r{^features/step_definitions/(.+)_steps\.rb$}) { |m| Dir[File.join("**/#{m[1]}.feature")][0] || 'features' }
end
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
Copyright (c) 2012 Jon Doveston

MIT License

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.markdown
ADDED
@@ -0,0 +1,69 @@
# Sphinxcrawl

A simple command-line gem to crawl a website and generate an XML stream for a
Sphinx search index.

## Installation

Install it with:

    $ gem install sphinxcrawl

## Usage

After installation the `sphinxcrawl` command should be available.

Create a sphinx.conf file, for example:

    source page
    {
      type = xmlpipe2
      xmlpipe_command = sphinxcrawl http://www.example.com -d 2 2>/dev/null
    }

    index page
    {
      source = page
      path = sphinx/page
      morphology = stem_en
      charset_type = utf-8
      html_strip = 1
    }

Install Sphinx; on Debian-based Linux distributions this is:

    sudo apt-get install sphinxsearch

You should now have the `indexer` and `search` commands. Build the index with:

    indexer -c sphinx.conf page

and you can search the index with:

    search -c sphinx.conf name
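The `sphinxcrawl` command writes the xmlpipe2 stream to stdout (which is what
`xmlpipe_command` consumes above), so you can also capture the stream directly
to inspect it:

    sphinxcrawl http://www.example.com -d 2 > stream.xml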

## Requirements

This gem is not Google! It uses Nokogiri to parse HTML, so it will fail on
badly formed HTML. More importantly, it only indexes the parts of a page
marked with a specific HTML attribute, which means you can only index sites
that you control. For example

    ...
    <div data-field="description">
      <p>This is a description</p>
    </div>
    ...

will index the text inside the div tag (stripping other tags) and put the text
in a Sphinx field called `description`. All fields are aggregated from all the
pages of a site crawl.
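For reference, here is a sketch of the xmlpipe2 document the stream generator
(`lib/sphinxcrawl/stream.rb`) would emit for the snippet above — indicative
only; exact ids, attribute quoting and whitespace may differ:

    <?xml version='1.0' encoding='UTF-8'?>
    <sphinx:docset>
      <sphinx:schema>
        <sphinx:attr name='url' type='string'/>
        <sphinx:field name='description'/>
      </sphinx:schema>
      <sphinx:document id='1'>
        <url><![CDATA[/]]></url>
        <description><![CDATA[This is a description]]></description>
      </sphinx:document>
    </sphinx:docset>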

## Contributing

1. Fork it
2. Create your feature branch (`git checkout -b my-new-feature`)
3. Commit your changes (`git commit -am 'Add some feature'`)
4. Push to the branch (`git push origin my-new-feature`)
5. Create new Pull Request
data/README.rdoc
ADDED
@@ -0,0 +1,13 @@
= sphinxcrawl - HTML crawler and Sphinx xmlstream generator

Author::    Jon Doveston (jon@doveston.me.uk)
Copyright:: Copyright (c) 2012 Jon Doveston

A command line gem that can crawl HTML (files or http) and generate an XML
stream for Sphinx search.

== Links

* {Source on Github}[https://github.com/hatoishi/sphinxcrawl]
data/Rakefile
ADDED
@@ -0,0 +1,66 @@
def dump_load_path
  puts $LOAD_PATH.join("\n")
  found = nil
  $LOAD_PATH.each do |path|
    if File.exists?(File.join(path, "rspec"))
      puts "Found rspec in #{path}"
      if File.exists?(File.join(path, "rspec", "core"))
        puts "Found core"
        if File.exists?(File.join(path, "rspec", "core", "rake_task"))
          puts "Found rake_task"
          found = path
        else
          puts "!! no rake_task"
        end
      else
        puts "!!! no core"
      end
    end
  end
  if found.nil?
    puts "Didn't find rspec/core/rake_task anywhere"
  else
    # note: report the path we remembered above ("path" is out of scope here)
    puts "Found in #{found}"
  end
end

require 'bundler'
require 'rake/clean'

begin
  require 'rspec/core/rake_task'
rescue LoadError
  dump_load_path
  raise
end

require 'cucumber'
require 'cucumber/rake/task'
gem 'rdoc' # we need the installed RDoc gem, not the system one
require 'rdoc/task'

include Rake::DSL

Bundler::GemHelper.install_tasks

RSpec::Core::RakeTask.new do |t|
  # Put spec opts in a file named .rspec in root
end

CUKE_RESULTS = 'results.html'
CLEAN << CUKE_RESULTS
Cucumber::Rake::Task.new(:features) do |t|
  t.cucumber_opts = "features --format html -o #{CUKE_RESULTS} --format pretty --no-source -x"
  t.fork = false
end

Rake::RDocTask.new do |rd|
  rd.main = "README.rdoc"
  rd.rdoc_files.include("README.rdoc", "lib/**/*.rb", "bin/**/*")
end

task :default => [:spec, :features]
data/bin/sphinxcrawl
ADDED
@@ -0,0 +1,35 @@
#!/usr/bin/env ruby

require 'optparse'
require 'methadone'
require 'sphinxcrawl'

class App
  include Methadone::Main
  include Methadone::CLILogging

  main do |index|
    index ||= 'index.html'
    if index =~ /http:/
      crawler = Sphinxcrawl::WebCrawler.new(index, options[:depth].to_i)
    else
      exit_now!(1, "file #{index} not found") unless File.exists?(index)
      index = index + '/index.html' if File.directory?(index)
      exit_now!(1, "file #{index} not found") unless File.exists?(index)
      crawler = Sphinxcrawl::FileCrawler.new(index, options[:depth].to_i)
    end
    pages = crawler.pages
    stream = Sphinxcrawl::Stream.new(pages)
    puts stream.to_xml
  end

  description 'Generate sphinx xml stream from html pages'
  arg :index, :optional

  options['depth'] = '0'
  on('-d depth', '--depth', 'Crawl depth', /^\d+$/)

  version Sphinxcrawl::VERSION
  use_log_level_option
  go!
end
data/example.conf
ADDED
@@ -0,0 +1,14 @@
source page
{
  type = xmlpipe2
  xmlpipe_command = bundle exec ./bin/sphinxcrawl spec/fixtures/index_file.html -d 2 2>/dev/null
}

index page
{
  source = page
  path = tmp/page
  morphology = stem_en
  charset_type = utf-8
  html_strip = 1
}
data/features/sphinxcrawl.feature
ADDED
@@ -0,0 +1,70 @@
Feature: Generation of sphinx xml
  Scenario: help page
    When I get help for "sphinxcrawl"
    Then the exit status should be 0
    And the banner should be present
    And the banner should document that this app takes options
    And the following options should be documented:
      |--version|

  Scenario: xml for a nonexistent file
    When I run `sphinxcrawl spec/fixtures/blah.html`
    Then the exit status should be 1

  Scenario: xml for a nonexistent index
    When I run `sphinxcrawl spec`
    Then the exit status should be 1

  Scenario: xml for a single file
    Given a file named "test.html" with:
    """
    <html>
      <head>
      </head>
      <body>
        <h1 data-field="name">My name</h1>
        <p data-field="description">My description</p>
        <p data-field="body"><span>My</span><span>body</span></p>
      </body>
    </html>
    """
    When I run `sphinxcrawl test.html`
    Then the exit status should be 0
    And the output should contain "name"
    And the output should contain "description"
    And the output should contain "body"

  Scenario: index the xml
    Given a file named "test.html" with:
    """
    <html>
      <head>
      </head>
      <body>
        <h1 data-field="name">My name</h1>
        <p data-field="description">My description</p>
        <p data-field="body"><span>My</span><span>body</span></p>
      </body>
    </html>
    """
    And a file named "test.conf" with:
    """
    indexer
    {
    }
    source page
    {
      type = xmlpipe2
      xmlpipe_command = ../../bin/sphinxcrawl test.html 2>/dev/null
    }
    index page
    {
      source = page
      path = page
      morphology = stem_en
      charset_type = utf-8
      html_strip = 1
    }
    """
    When I run `indexer -c test.conf page`
    Then the exit status should be 0
data/features/step_definitions/sphinxcrawl_steps.rb
ADDED
@@ -0,0 +1 @@
# Put your step definitions here
data/features/support/env.rb
ADDED
@@ -0,0 +1,16 @@
require 'aruba/cucumber'
require 'methadone/cucumber'

ENV['PATH'] = "#{File.expand_path(File.dirname(__FILE__) + '/../../bin')}#{File::PATH_SEPARATOR}#{ENV['PATH']}"
LIB_DIR = File.join(File.expand_path(File.dirname(__FILE__)), '..', '..', 'lib')

Before do
  # Using "announce" causes massive warnings on 1.9.2
  @puts = true
  @original_rubylib = ENV['RUBYLIB']
  ENV['RUBYLIB'] = LIB_DIR + File::PATH_SEPARATOR + ENV['RUBYLIB'].to_s
end

After do
  ENV['RUBYLIB'] = @original_rubylib
end
data/lib/sphinxcrawl.rb
ADDED

data/lib/sphinxcrawl/crawler.rb
ADDED
@@ -0,0 +1,36 @@
require 'set'

module Sphinxcrawl
  class Crawler
    attr_reader :depth

    def initialize(depth=0)
      @depth = depth
    end

    def pages
      return @pages if @pages
      return [] unless index
      @pages = Set.new([index])
      return @pages if @depth == 0

      current_pages = Set.new([index])
      depth.times do
        links = current_pages.map(&:links).flatten.compact.uniq
        current_pages = Set.new(links.map{ |url| get_page(url) }.compact) - @pages
        @pages += current_pages
      end
      @pages
    end

    private

    def index
      nil
    end

    def get_page(url)
      nil
    end
  end
end
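Crawler is a template class: pages expands the link graph breadth-first, depth times, deduplicating through the Set (and Page's url-based eql?/hash), while subclasses supply the two private hooks index and get_page. A minimal sketch of a custom subclass — hypothetical, purely to illustrate the contract (the gem itself ships FileCrawler and WebCrawler):

    # Hypothetical: serve pages out of an in-memory hash of url => html
    class HashCrawler < Sphinxcrawl::Crawler
      def initialize(pages_by_url, depth=0)
        @pages_by_url = pages_by_url
        super(depth)
      end

      private

      # the page the crawl starts from
      def index
        @index ||= get_page('index.html')
      end

      # resolve a link to a Page, or nil if it cannot be fetched
      def get_page(url)
        html = @pages_by_url[url]
        Sphinxcrawl::Page.new(url, html) if html
      end
    end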
data/lib/sphinxcrawl/file_crawler.rb
ADDED
@@ -0,0 +1,20 @@
module Sphinxcrawl
  class FileCrawler < Crawler
    def initialize(index_file_name, depth=0)
      @directory = File.dirname(index_file_name)
      @basename = File.basename(index_file_name)
      super(depth)
    end

    private

    def index
      @index ||= get_page(@basename)
    end

    def get_page(url)
      html = File.read(@directory + '/' + url) rescue nil
      Page.new(url, html) if html
    end
  end
end
data/lib/sphinxcrawl/page.rb
ADDED
@@ -0,0 +1,58 @@
require 'uri'
require 'nokogiri'

module Sphinxcrawl
  class Page
    attr_reader :html, :url

    def initialize(url, html)
      @url = url
      @html = html
      @field_data = {}
    end

    def empty?
      !html || html.empty?
    end

    def field_names
      fields.map do |n|
        n.attribute('data-field').value
      end.uniq
    end

    def field(name)
      @field_data[name] ||= get_field(name)
    end

    def links
      @links ||= document.xpath('//a/@href').map(&:value).map do |url|
        # only local links (no http://)
        uri = URI.parse(url)
        uri.host.nil? && uri.scheme.nil? ? url : nil
      end.compact
    end

    def eql?(compare)
      url == compare.url
    end
    alias :== :eql?

    def hash
      url.hash
    end

    private

    def get_field(name)
      document.xpath("//*[@data-field='#{name}']//text()").map(&:content).join(' ')
    end

    def fields
      @fields ||= document.xpath('//*[@data-field]')
    end

    def document
      @document ||= Nokogiri::HTML(html)
    end
  end
end
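Page does all the HTML work: every element carrying a data-field attribute becomes a named field (its text nodes joined with spaces), and links keeps only site-local hrefs. A quick usage sketch, mirroring the expectations in spec/sphinxcrawl/page_spec.rb below:

    require 'sphinxcrawl'

    html = File.read('spec/fixtures/index_file.html')
    page = Sphinxcrawl::Page.new('index_file.html', html)

    page.field_names   # => ["name", "description", "body"]
    page.field('body') # => "My body" (span texts joined with a space)
    page.links         # => ["tree_file.html", "index_file.html"] (the http:// link is dropped)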
data/lib/sphinxcrawl/stream.rb
ADDED
@@ -0,0 +1,85 @@
require 'rexml/document'

module Sphinxcrawl
  class Stream
    include REXML

    attr_reader :pages

    def initialize(pages)
      @pages = Array(pages)
      @count = 0
    end

    def empty?
      pages.length == 0
    end

    def number_of_pages
      pages.length
    end

    def field_names
      @field_names ||= pages.map(&:field_names).flatten.uniq
    end

    def to_xml
      add_fields
      add_documents
      ''.tap { |xml| document.write(xml) }
    end

    private

    def next_id
      @count += 1
    end

    def add_fields
      field_names.each do |name|
        add_field(name)
      end
    end

    def add_documents
      pages.each do |page|
        add_document(page)
      end
    end

    def add_field(name)
      schema.add_element('sphinx:field').tap do |field|
        field.add_attribute('name', name)
      end
    end

    def add_document(page)
      if page.field_names.length > 0
        root.add_element('sphinx:document').tap do |doc|
          doc.add_attribute('id', next_id)
          doc.add_element('url').text = CData.new(page.url)
          field_names.each do |name|
            doc.add_element(name).text = CData.new(page.field(name))
          end
        end
      end
    end

    def document
      @document ||= Document.new.tap do |doc|
        doc << XMLDecl.new('1.0', 'UTF-8')
        @docset = doc.add_element('sphinx:docset')
      end
    end

    def root
      document.root
    end

    def schema
      @schema ||= root.add_element('sphinx:schema').tap do |sch|
        sch.add_element('sphinx:attr').add_attributes('name' => 'url', 'type' => 'string')
      end
    end
  end
end
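End to end, the pieces compose exactly as bin/sphinxcrawl does; a minimal sketch driving the library from Ruby (the fixture path is just an example):

    require 'sphinxcrawl'

    crawler = Sphinxcrawl::FileCrawler.new('spec/fixtures/index_file.html', 2)
    stream  = Sphinxcrawl::Stream.new(crawler.pages)
    puts stream.to_xml   # the sphinx:docset document, ready for xmlpipe2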
data/lib/sphinxcrawl/web_crawler.rb
ADDED
@@ -0,0 +1,43 @@
require 'uri'
require 'net/http'

module Sphinxcrawl
  class WebCrawler < Crawler
    def initialize(url, depth=0)
      @uri = URI.parse(url)
      super(depth)
    end

    private

    def host
      @uri.host
    end

    def port
      @uri.port
    end

    def index_path
      @uri.path
    end

    def http
      Net::HTTP.new(host, port)
    end

    def index
      @index ||= get_page(index_path)
    end

    def get_page(path)
      path = path && !path.empty? ? path : '/'
      location = path
      # keep issuing GETs, following Location headers until no redirect is returned
      begin
        response = http.request(Net::HTTP::Get.new(location))
      end while location = response.header['location']
      html = response.body
      Page.new(path, html) if html
    end
  end
end
data/spec/fixtures/child_file.html
ADDED
@@ -0,0 +1,11 @@
<html>
  <head>
  </head>
  <body>
    <h1 data-field="name">My name is child</h1>
    <p data-field="description">My description is child</p>
    <p data-field="body"><span>My</span><span>body</span>is child</p>
    <a href="index_file.html">index</a>
    <a href="tree_file.html">tree</a>
  </body>
</html>
data/spec/fixtures/child_web.html
ADDED
@@ -0,0 +1,11 @@
<html>
  <head>
  </head>
  <body>
    <h1 data-field="name">My name is child</h1>
    <p data-field="description">My description is child</p>
    <p data-field="body"><span>My</span><span>body</span>is child</p>
    <a href="/index_web.html">index</a>
    <a href="/tree_web.html">tree</a>
  </body>
</html>
data/spec/fixtures/index_file.html
ADDED
@@ -0,0 +1,12 @@
<html>
  <head>
  </head>
  <body>
    <h1 data-field="name">My name</h1>
    <p data-field="description">My description</p>
    <p data-field="body"><span>My</span><span>body</span></p>
    <a href="tree_file.html">tree</a>
    <a href="index_file.html">index</a>
    <a href="http://www.google.com">google</a>
  </body>
</html>
data/spec/fixtures/index_web.html
ADDED
@@ -0,0 +1,11 @@
<html>
  <head>
  </head>
  <body>
    <h1 data-field="name">My name</h1>
    <p data-field="description">My description</p>
    <p data-field="body"><span>My</span><span>body</span></p>
    <a href="/tree_web.html">tree</a>
    <a href="/index_web.html">index</a>
  </body>
</html>
data/spec/fixtures/tree_file.html
ADDED
@@ -0,0 +1,12 @@
<html>
  <head>
  </head>
  <body>
    <h1 data-field="name">My name is tree</h1>
    <p data-field="description">My description is tree</p>
    <p data-field="body"><span>My</span><span>body</span>is tree</p>
    <a href="index_file.html">index</a>
    <a href="child_file.html">child</a>
    <a href="tree_file.html">tree</a>
  </body>
</html>
data/spec/fixtures/tree_web.html
ADDED
@@ -0,0 +1,12 @@
<html>
  <head>
  </head>
  <body>
    <h1 data-field="name">My name is tree</h1>
    <p data-field="description">My description is tree</p>
    <p data-field="body"><span>My</span><span>body</span>is tree</p>
    <a href="/index_web.html">index</a>
    <a href="/child_web.html">child</a>
    <a href="/tree_web.html">tree</a>
  </body>
</html>
data/spec/spec_helper.rb
ADDED

data/spec/sphinxcrawl/file_crawler_spec.rb
ADDED
@@ -0,0 +1,23 @@
require 'spec_helper'

describe Sphinxcrawl::FileCrawler do
  let(:dir) { 'spec/fixtures' }
  let(:index) { 'index_file.html' }
  let(:html) { dir + '/' + index }
  subject { Sphinxcrawl::FileCrawler.new(html) }

  its(:depth) { should be == 0 }
  specify { subject.pages.length.should be == 1 }

  context "with a depth of 1" do
    subject { Sphinxcrawl::FileCrawler.new(html, 1) }
    its(:depth) { should be == 1 }
    specify { subject.pages.length.should be == 2 }
  end

  context "with a depth of 2" do
    subject { Sphinxcrawl::FileCrawler.new(html, 2) }
    its(:depth) { should be == 2 }
    specify { subject.pages.length.should be == 3 }
  end
end
data/spec/sphinxcrawl/page_spec.rb
ADDED
@@ -0,0 +1,23 @@
require 'spec_helper'

describe Sphinxcrawl::Page do
  let(:html) { File.read('spec/fixtures/index_file.html') }
  let(:same) { Sphinxcrawl::Page.new(subject.url, 'blah') }
  subject { Sphinxcrawl::Page.new('index_file.html', html) }

  its(:html) { should be == html }
  it { should_not be_empty }

  its(:field_names) { should be == %w[name description body] }
  specify { subject.field('name').should be == 'My name' }
  specify { subject.field('description').should be == 'My description' }
  specify { subject.field('body').should be == 'My body' }

  its(:links) { should be == ['tree_file.html', 'index_file.html'] }

  specify { subject.should be == same }
  specify { subject.should eq(same) }
  specify { subject.should eql(same) }
  specify { subject.should_not equal same }
  specify { [subject, same].uniq.length.should be == 1 }
end
data/spec/sphinxcrawl/stream_spec.rb
ADDED
@@ -0,0 +1,20 @@
require 'spec_helper'

describe Sphinxcrawl::Stream do
  let(:html) { File.read('spec/fixtures/index_file.html') }
  let(:page) { Sphinxcrawl::Page.new('index_file.html', html) }
  subject { Sphinxcrawl::Stream.new(page) }
  let(:xml) { subject.to_xml }

  it { should_not be_empty }
  its(:number_of_pages) { should be == 1 }

  its(:field_names) { should be == %w[name description body] }

  its(:to_xml) { should include('name') }
  its(:to_xml) { should include('description') }
  its(:to_xml) { should include('body') }
  its(:to_xml) { should include('My name') }
  its(:to_xml) { should include('My description') }
  its(:to_xml) { should include('My body') }
end
data/spec/sphinxcrawl/web_crawler_spec.rb
ADDED
@@ -0,0 +1,32 @@
require 'spec_helper'

describe Sphinxcrawl::WebCrawler do
  let(:dir) { 'spec/fixtures' }
  let(:domain) { 'http://www.example.com' }
  let(:index) { 'index_web.html' }
  let(:tree) { 'tree_web.html' }
  let(:child) { 'child_web.html' }

  before do
    stub_request(:get, domain+'/'+index).to_return(:body => File.new(dir+'/'+index), :status => 200)
    stub_request(:get, domain+'/'+tree).to_return(:body => File.new(dir+'/'+tree), :status => 200)
    stub_request(:get, domain+'/'+child).to_return(:body => File.new(dir+'/'+child), :status => 200)
  end

  subject { Sphinxcrawl::WebCrawler.new(domain+'/'+index) }

  its(:depth) { should be == 0 }
  specify { subject.pages.length.should be == 1 }

  context "with a depth of 1" do
    subject { Sphinxcrawl::WebCrawler.new(domain+'/'+index, 1) }
    its(:depth) { should be == 1 }
    specify { subject.pages.length.should be == 2 }
  end

  context "with a depth of 2" do
    subject { Sphinxcrawl::WebCrawler.new(domain+'/'+index, 2) }
    its(:depth) { should be == 2 }
    specify { subject.pages.length.should be == 3 }
  end
end
data/sphinxcrawl.gemspec
ADDED
@@ -0,0 +1,26 @@
# -*- encoding: utf-8 -*-
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'sphinxcrawl/version'

Gem::Specification.new do |gem|
  gem.name          = 'sphinxcrawl'
  gem.version       = Sphinxcrawl::VERSION
  gem.authors       = ['Jon Doveston']
  gem.email         = ['jon@doveston.me.uk']
  gem.description   = %q{Simple command to crawl a site and process html into a sphinx xmlstream}
  gem.summary       = %q{Simple command to crawl a site and process html into a sphinx xmlstream}
  gem.homepage      = 'https://github.com/hatoishi/sphinxcrawl'

  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ['lib']
  gem.add_dependency('methadone', '~> 1.2.2')
  gem.add_dependency('nokogiri', '~> 1.5.5')

  gem.add_development_dependency('rdoc')
  gem.add_development_dependency('aruba')
  gem.add_development_dependency('rake')
  gem.add_development_dependency('webmock')
end
metadata
ADDED
@@ -0,0 +1,194 @@
--- !ruby/object:Gem::Specification
name: sphinxcrawl
version: !ruby/object:Gem::Version
  version: 0.0.1
  prerelease:
platform: ruby
authors:
- Jon Doveston
autorequire:
bindir: bin
cert_chain: []
date: 2012-11-03 00:00:00.000000000 Z
dependencies:
- !ruby/object:Gem::Dependency
  name: methadone
  requirement: !ruby/object:Gem::Requirement
    none: false
    requirements:
    - - ~>
      - !ruby/object:Gem::Version
        version: 1.2.2
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    none: false
    requirements:
    - - ~>
      - !ruby/object:Gem::Version
        version: 1.2.2
- !ruby/object:Gem::Dependency
  name: nokogiri
  requirement: !ruby/object:Gem::Requirement
    none: false
    requirements:
    - - ~>
      - !ruby/object:Gem::Version
        version: 1.5.5
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    none: false
    requirements:
    - - ~>
      - !ruby/object:Gem::Version
        version: 1.5.5
- !ruby/object:Gem::Dependency
  name: rdoc
  requirement: !ruby/object:Gem::Requirement
    none: false
    requirements:
    - - ! '>='
      - !ruby/object:Gem::Version
        version: '0'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    none: false
    requirements:
    - - ! '>='
      - !ruby/object:Gem::Version
        version: '0'
- !ruby/object:Gem::Dependency
  name: aruba
  requirement: !ruby/object:Gem::Requirement
    none: false
    requirements:
    - - ! '>='
      - !ruby/object:Gem::Version
        version: '0'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    none: false
    requirements:
    - - ! '>='
      - !ruby/object:Gem::Version
        version: '0'
- !ruby/object:Gem::Dependency
  name: rake
  requirement: !ruby/object:Gem::Requirement
    none: false
    requirements:
    - - ! '>='
      - !ruby/object:Gem::Version
        version: '0'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    none: false
    requirements:
    - - ! '>='
      - !ruby/object:Gem::Version
        version: '0'
- !ruby/object:Gem::Dependency
  name: webmock
  requirement: !ruby/object:Gem::Requirement
    none: false
    requirements:
    - - ! '>='
      - !ruby/object:Gem::Version
        version: '0'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    none: false
    requirements:
    - - ! '>='
      - !ruby/object:Gem::Version
        version: '0'
description: Simple command to crawl a site and process html into a sphinx xmlstream
email:
- jon@doveston.me.uk
executables:
- sphinxcrawl
extensions: []
extra_rdoc_files: []
files:
- .gitignore
- Gemfile
- Gemfile.lock
- Guardfile
- LICENSE.txt
- README.markdown
- README.rdoc
- Rakefile
- bin/sphinxcrawl
- example.conf
- features/sphinxcrawl.feature
- features/step_definitions/sphinxcrawl_steps.rb
- features/support/env.rb
- lib/sphinxcrawl.rb
- lib/sphinxcrawl/crawler.rb
- lib/sphinxcrawl/file_crawler.rb
- lib/sphinxcrawl/page.rb
- lib/sphinxcrawl/stream.rb
- lib/sphinxcrawl/version.rb
- lib/sphinxcrawl/web_crawler.rb
- spec/fixtures/child_file.html
- spec/fixtures/child_web.html
- spec/fixtures/index_file.html
- spec/fixtures/index_web.html
- spec/fixtures/tree_file.html
- spec/fixtures/tree_web.html
- spec/spec_helper.rb
- spec/sphinxcrawl/file_crawler_spec.rb
- spec/sphinxcrawl/page_spec.rb
- spec/sphinxcrawl/stream_spec.rb
- spec/sphinxcrawl/web_crawler_spec.rb
- sphinxcrawl.gemspec
homepage: https://github.com/hatoishi/sphinxcrawl
licenses: []
post_install_message:
rdoc_options: []
require_paths:
- lib
required_ruby_version: !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ! '>='
    - !ruby/object:Gem::Version
      version: '0'
      segments:
      - 0
      hash: 3810366116509016959
required_rubygems_version: !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ! '>='
    - !ruby/object:Gem::Version
      version: '0'
      segments:
      - 0
      hash: 3810366116509016959
requirements: []
rubyforge_project:
rubygems_version: 1.8.24
signing_key:
specification_version: 3
summary: Simple command to crawl a site and process html into a sphinx xmlstream
test_files:
- features/sphinxcrawl.feature
- features/step_definitions/sphinxcrawl_steps.rb
- features/support/env.rb
- spec/fixtures/child_file.html
- spec/fixtures/child_web.html
- spec/fixtures/index_file.html
- spec/fixtures/index_web.html
- spec/fixtures/tree_file.html
- spec/fixtures/tree_web.html
- spec/spec_helper.rb
- spec/sphinxcrawl/file_crawler_spec.rb
- spec/sphinxcrawl/page_spec.rb
- spec/sphinxcrawl/stream_spec.rb
- spec/sphinxcrawl/web_crawler_spec.rb