sphinxcrawl 0.0.1

data/.gitignore ADDED
@@ -0,0 +1,7 @@
+ .DS_Store
+ results.html
+ pkg
+ html
+ .bundle
+ tags
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,17 @@
+ source 'https://rubygems.org'
+
+ # Specify your gem's dependencies in sphinxcrawl.gemspec
+ gemspec
+
+ gem 'guard'
+ gem 'guard-rspec'
+ gem 'guard-ctags-bundler'
+ gem 'guard-cucumber'
+
+ group :darwin do
+   gem 'rb-fsevent'
+ end
+
+ group :linux do
+   gem 'rb-inotify'
+ end
data/Gemfile.lock ADDED
@@ -0,0 +1,89 @@
+ PATH
+   remote: .
+   specs:
+     sphinxcrawl (0.0.1)
+       methadone (~> 1.2.2)
+       nokogiri (~> 1.5.5)
+
+ GEM
+   remote: https://rubygems.org/
+   specs:
+     addressable (2.3.2)
+     aruba (0.5.0)
+       childprocess (= 0.2.3)
+       cucumber (>= 1.1.1)
+       ffi (>= 1.0.11)
+       rspec-expectations (>= 2.7.0)
+     builder (3.1.4)
+     childprocess (0.2.3)
+       ffi (~> 1.0.6)
+     coderay (1.0.8)
+     crack (0.3.1)
+     cucumber (1.2.1)
+       builder (>= 2.1.2)
+       diff-lcs (>= 1.1.3)
+       gherkin (~> 2.11.0)
+       json (>= 1.4.6)
+     diff-lcs (1.1.3)
+     ffi (1.0.11)
+     gherkin (2.11.5)
+       json (>= 1.4.6)
+     guard (1.5.3)
+       listen (>= 0.4.2)
+       lumberjack (>= 1.0.2)
+       pry (>= 0.9.10)
+       thor (>= 0.14.6)
+     guard-ctags-bundler (0.1.6)
+       guard (>= 1.1)
+     guard-cucumber (1.2.2)
+       cucumber (>= 1.2.0)
+       guard (>= 1.1.0)
+     guard-rspec (2.1.1)
+       guard (>= 1.1)
+       rspec (~> 2.11)
+     json (1.7.5)
+     listen (0.5.3)
+     lumberjack (1.0.2)
+     methadone (1.2.2)
+       bundler
+     method_source (0.8.1)
+     nokogiri (1.5.5)
+     pry (0.9.10)
+       coderay (~> 1.0.5)
+       method_source (~> 0.8)
+       slop (~> 3.3.1)
+     rake (0.9.2.2)
+     rb-fsevent (0.9.2)
+     rb-inotify (0.8.8)
+       ffi (>= 0.5.0)
+     rdoc (3.12)
+       json (~> 1.4)
+     rspec (2.11.0)
+       rspec-core (~> 2.11.0)
+       rspec-expectations (~> 2.11.0)
+       rspec-mocks (~> 2.11.0)
+     rspec-core (2.11.1)
+     rspec-expectations (2.11.3)
+       diff-lcs (~> 1.1.3)
+     rspec-mocks (2.11.3)
+     slop (3.3.3)
+     thor (0.16.0)
+     webmock (1.8.10)
+       addressable (>= 2.2.7)
+       crack (>= 0.1.7)
+
+ PLATFORMS
+   ruby
+
+ DEPENDENCIES
+   aruba
+   guard
+   guard-ctags-bundler
+   guard-cucumber
+   guard-rspec
+   rake
+   rb-fsevent
+   rb-inotify
+   rdoc
+   sphinxcrawl!
+   webmock
data/Guardfile ADDED
@@ -0,0 +1,16 @@
+ guard 'rspec' do
+   watch(%r{^spec/.+_spec\.rb$})
+   watch(%r{^lib/(.+)\.rb$}) { |m| "spec/lib/#{m[1]}_spec.rb" }
+   watch('spec/spec_helper.rb') { 'spec' }
+ end
+
+ guard 'ctags-bundler', :src_path => ['lib', 'spec'] do
+   watch(/^(lib)\/.*\.rb$/)
+   watch('Gemfile.lock')
+ end
+
+ guard 'cucumber' do
+   watch(%r{^features/.+\.feature$})
+   watch(%r{^features/support/.+$}) { 'features' }
+   watch(%r{^features/step_definitions/(.+)_steps\.rb$}) { |m| Dir[File.join("**/#{m[1]}.feature")][0] || 'features' }
+ end
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
+ Copyright (c) 2012 Jon Doveston
+
+ MIT License
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.markdown ADDED
@@ -0,0 +1,69 @@
+ # Sphinxcrawl
+
+ A simple command-line tool gem that crawls a website and generates an XML
+ stream for a Sphinx search index.
+
+ ## Installation
+
+ Install it with:
+
+     $ gem install sphinxcrawl
+
+ ## Usage
+
+ After installation the `sphinxcrawl` command should be available.
+
+ Create a sphinx.conf file, for example:
+
+
+     source page
+     {
+       type = xmlpipe2
+       xmlpipe_command = sphinxcrawl http://www.example.com -d 2 2>/dev/null
+     }
+
+     index page
+     {
+       source = page
+       path = sphinx/page
+       morphology = stem_en
+       charset_type = utf-8
+       html_strip = 1
+     }
+
+ Install Sphinx itself; on Debian-based Linux distributions this is
+
+     sudo apt-get install sphinxsearch
+
+ You should now have the `indexer` and `search` commands. Build the index with
+
+     indexer -c sphinx.conf page
+
+ and you can search in the index with
+
+     search -c sphinx.conf name
+
+ ## Requirements
+
+ This gem is not Google! It uses Nokogiri to parse HTML, so it will fail on
+ badly formed pages. More importantly, it only indexes the parts of a page
+ marked with a specific HTML attribute, which means it is only practical for
+ sites that you control. For example
+
+     ...
+     <div data-field="description">
+       <p>This is a description</p>
+     </div>
+     ...
+
+ will index the text inside the div tag (stripping any nested tags) and put it
+ in a Sphinx field called `description`. Fields are aggregated across all the
+ pages visited during a crawl.
+
+ ## Contributing
+
+ 1. Fork it
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
+ 4. Push to the branch (`git push origin my-new-feature`)
+ 5. Create a new Pull Request
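The data-field extraction described in the README above can also be exercised directly from Ruby, which is handy for checking what will end up in the index. A minimal sketch using the Page and Stream classes added in lib/ later in this diff (the HTML string and file name are only illustrations):

    require 'sphinxcrawl'

    # A fragment marked up with the data-field attribute, as in the README example.
    html = '<div data-field="description"><p>This is a description</p></div>'

    page = Sphinxcrawl::Page.new('example.html', html)
    page.field_names           # => ["description"]
    page.field('description')  # => "This is a description"

    # One or more pages wrapped in a Stream produce the xmlpipe2 document
    # that sphinx's indexer reads.
    puts Sphinxcrawl::Stream.new([page]).to_xml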
data/README.rdoc ADDED
@@ -0,0 +1,13 @@
+ = sphinxcrawl - html crawler and sphinx xmlstream generator
+
+ Author:: Jon Doveston (jon@doveston.me.uk)
+ Copyright:: Copyright (c) 2012 Jon Doveston
+
+
+ A command line gem that can crawl html (files or http) and generate an xml
+ stream for sphinx search.
+
+ == Links
+
+ * {Source on Github}[https://github.com/hatoishi/sphinxcrawl]
+
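The "files or http" distinction above maps onto two crawler classes in lib/. A minimal sketch of the same dispatch that bin/sphinxcrawl performs (the default target path and the depth of 2 are only examples):

    require 'sphinxcrawl'

    target = ARGV.first || 'spec/fixtures/index_file.html'
    depth  = 2

    # http URLs go through WebCrawler, anything else through FileCrawler,
    # mirroring the check in bin/sphinxcrawl.
    crawler = if target =~ /http:/
                Sphinxcrawl::WebCrawler.new(target, depth)
              else
                Sphinxcrawl::FileCrawler.new(target, depth)
              end

    # `pages` follows local links up to `depth` hops from the index page;
    # Stream turns the result into the xmlpipe2 document for sphinx's indexer.
    puts Sphinxcrawl::Stream.new(crawler.pages).to_xml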
data/Rakefile ADDED
@@ -0,0 +1,66 @@
+ def dump_load_path
+   puts $LOAD_PATH.join("\n")
+   found = nil
+   $LOAD_PATH.each do |path|
+     if File.exists?(File.join(path,"rspec"))
+       puts "Found rspec in #{path}"
+       if File.exists?(File.join(path,"rspec","core"))
+         puts "Found core"
+         if File.exists?(File.join(path,"rspec","core","rake_task"))
+           puts "Found rake_task"
+           found = path
+         else
+           puts "!! no rake_task"
+         end
+       else
+         puts "!!! no core"
+       end
+     end
+   end
+   if found.nil?
+     puts "Didn't find rspec/core/rake_task anywhere"
+   else
+     puts "Found in #{found}"
+   end
+ end
+ require 'bundler'
+ require 'rake/clean'
+
+ begin
+   require 'rspec/core/rake_task'
+ rescue LoadError
+   dump_load_path
+   raise
+ end
+
+ require 'cucumber'
+ require 'cucumber/rake/task'
+ gem 'rdoc' # we need the installed RDoc gem, not the system one
+ require 'rdoc/task'
+
+ include Rake::DSL
+
+ Bundler::GemHelper.install_tasks
+
+
+ RSpec::Core::RakeTask.new do |t|
+   # Put spec opts in a file named .rspec in root
+ end
+
+
+ CUKE_RESULTS = 'results.html'
+ CLEAN << CUKE_RESULTS
+ Cucumber::Rake::Task.new(:features) do |t|
+   t.cucumber_opts = "features --format html -o #{CUKE_RESULTS} --format pretty --no-source -x"
+   t.fork = false
+ end
+
+ Rake::RDocTask.new do |rd|
+
+   rd.main = "README.rdoc"
+
+   rd.rdoc_files.include("README.rdoc","lib/**/*.rb","bin/**/*")
+ end
+
+ task :default => [:spec,:features]
+
data/bin/sphinxcrawl ADDED
@@ -0,0 +1,35 @@
+ #!/usr/bin/env ruby
+
+ require 'optparse'
+ require 'methadone'
+ require 'sphinxcrawl'
+
+ class App
+   include Methadone::Main
+   include Methadone::CLILogging
+
+   main do |index|
+     index ||= 'index.html'
+     if index =~ /http:/
+       crawler = Sphinxcrawl::WebCrawler.new(index, options[:depth].to_i)
+     else
+       exit_now!(1, "file #{index} not found") unless File.exists?(index)
+       index = index+'/index.html' if File.directory?(index)
+       exit_now!(1, "file #{index} not found") unless File.exists?(index)
+       crawler = Sphinxcrawl::FileCrawler.new(index, options[:depth].to_i)
+     end
+     pages = crawler.pages
+     stream = Sphinxcrawl::Stream.new(pages)
+     puts stream.to_xml
+   end
+
+   description 'Generate sphinx xml stream from html pages'
+   arg :index, :optional
+
+   options['depth'] = '0'
+   on('-d depth', '--depth', 'Crawl depth', /^\d+$/)
+
+   version Sphinxcrawl::VERSION
+   use_log_level_option
+   go!
+ end
data/example.conf ADDED
@@ -0,0 +1,14 @@
+ source page
+ {
+   type = xmlpipe2
+   xmlpipe_command = bundle exec ./bin/sphinxcrawl spec/fixtures/index_file.html -d 2 2>/dev/null
+ }
+
+ index page
+ {
+   source = page
+   path = tmp/page
+   morphology = stem_en
+   charset_type = utf-8
+   html_strip = 1
+ }
data/features/sphinxcrawl.feature ADDED
@@ -0,0 +1,70 @@
+ Feature: Generation of sphinx xml
+   Scenario: help page
+     When I get help for "sphinxcrawl"
+     Then the exit status should be 0
+     And the banner should be present
+     And the banner should document that this app takes options
+     And the following options should be documented:
+       |--version|
+
+   Scenario: xml for a nonexistent file
+     When I run `sphinxcrawl spec/fixtures/blah.html`
+     Then the exit status should be 1
+
+   Scenario: xml for a nonexistent index
+     When I run `sphinxcrawl spec`
+     Then the exit status should be 1
+
+   Scenario: xml for a single file
+     Given a file named "test.html" with:
+       """
+       <html>
+       <head>
+       </head>
+       <body>
+       <h1 data-field="name">My name</h1>
+       <p data-field="description">My description</p>
+       <p data-field="body"><span>My</span><span>body</span></p>
+       </body>
+       </html>
+       """
+     When I run `sphinxcrawl test.html`
+     Then the exit status should be 0
+     And the output should contain "name"
+     And the output should contain "description"
+     And the output should contain "body"
+
+   Scenario: index the xml
+     Given a file named "test.html" with:
+       """
+       <html>
+       <head>
+       </head>
+       <body>
+       <h1 data-field="name">My name</h1>
+       <p data-field="description">My description</p>
+       <p data-field="body"><span>My</span><span>body</span></p>
+       </body>
+       </html>
+       """
+     And a file named "test.conf" with:
+       """
+       indexer
+       {
+       }
+       source page
+       {
+         type = xmlpipe2
+         xmlpipe_command = ../../bin/sphinxcrawl test.html 2>/dev/null
+       }
+       index page
+       {
+         source = page
+         path = page
+         morphology = stem_en
+         charset_type = utf-8
+         html_strip = 1
+       }
+       """
+     When I run `indexer -c test.conf page`
+     Then the exit status should be 0
data/features/step_definitions/sphinxcrawl_steps.rb ADDED
@@ -0,0 +1 @@
+ # Put your step definitions here
data/features/support/env.rb ADDED
@@ -0,0 +1,16 @@
+ require 'aruba/cucumber'
+ require 'methadone/cucumber'
+
+ ENV['PATH'] = "#{File.expand_path(File.dirname(__FILE__) + '/../../bin')}#{File::PATH_SEPARATOR}#{ENV['PATH']}"
+ LIB_DIR = File.join(File.expand_path(File.dirname(__FILE__)),'..','..','lib')
+
+ Before do
+   # Using "announce" causes massive warnings on 1.9.2
+   @puts = true
+   @original_rubylib = ENV['RUBYLIB']
+   ENV['RUBYLIB'] = LIB_DIR + File::PATH_SEPARATOR + ENV['RUBYLIB'].to_s
+ end
+
+ After do
+   ENV['RUBYLIB'] = @original_rubylib
+ end
data/lib/sphinxcrawl.rb ADDED
@@ -0,0 +1,9 @@
+ require 'sphinxcrawl/version'
+ require 'sphinxcrawl/page'
+ require 'sphinxcrawl/stream'
+ require 'sphinxcrawl/crawler'
+ require 'sphinxcrawl/file_crawler'
+ require 'sphinxcrawl/web_crawler'
+
+ module Sphinxcrawl
+ end
data/lib/sphinxcrawl/crawler.rb ADDED
@@ -0,0 +1,36 @@
+ require 'set'
+
+ module Sphinxcrawl
+   class Crawler
+     attr_reader :depth
+
+     def initialize(depth=0)
+       @depth = depth
+     end
+
+     def pages
+       return @pages if @pages
+       return [] unless index
+       @pages = Set.new([index])
+       return @pages if @depth == 0
+
+       current_pages = Set.new([index])
+       depth.times do
+         links = current_pages.map(&:links).flatten.compact.uniq
+         current_pages = Set.new(links.map{ |url| get_page(url) }.compact) - @pages
+         @pages += current_pages
+       end
+       @pages
+     end
+
+     private
+
+     def index
+       nil
+     end
+
+     def get_page(url)
+       nil
+     end
+   end
+ end
data/lib/sphinxcrawl/file_crawler.rb ADDED
@@ -0,0 +1,20 @@
+ module Sphinxcrawl
+   class FileCrawler < Crawler
+     def initialize(index_file_name, depth=0)
+       @directory = File.dirname(index_file_name)
+       @basename = File.basename(index_file_name)
+       super(depth)
+     end
+
+     private
+
+     def index
+       @index ||= get_page(@basename)
+     end
+
+     def get_page(url)
+       html = File.read(@directory + '/' + url) rescue nil
+       Page.new(url, html) if html
+     end
+   end
+ end
data/lib/sphinxcrawl/page.rb ADDED
@@ -0,0 +1,58 @@
+ require 'nokogiri'
+
+ module Sphinxcrawl
+   class Page
+     attr_reader :html, :url
+
+     def initialize(url, html)
+       @url = url
+       @html = html
+       @field_data = {}
+     end
+
+     def empty?
+       !html || html.empty?
+     end
+
+     def field_names
+       fields.map do |n|
+         n.attribute('data-field').value
+       end.uniq
+     end
+
+     def field(name)
+       @field_data[name] ||= get_field(name)
+     end
+
+     def links
+       @links ||= document.xpath('//a/@href').map(&:value).map do |url|
+         # only local links (no http://)
+         uri = URI.parse(url)
+         uri.host.nil? && uri.scheme.nil? ? url : nil
+       end.compact
+     end
+
+     def eql?(compare)
+       url == compare.url
+     end
+     alias :== :eql?
+
+     def hash
+       url.hash
+     end
+
+     private
+
+     def get_field(name)
+       document.xpath("//*[@data-field='#{name}']//text()").map(&:content).join(' ')
+     end
+
+     def fields
+       @fields ||= document.xpath('//*[@data-field]')
+     end
+
+     def document
+       @document ||= Nokogiri::HTML(html)
+     end
+   end
+ end
data/lib/sphinxcrawl/stream.rb ADDED
@@ -0,0 +1,85 @@
+ require 'rexml/document'
+
+ module Sphinxcrawl
+   class Stream
+     include REXML
+
+     attr_reader :pages
+
+     def initialize(pages)
+       @pages = Array(pages)
+       @count = 0
+     end
+
+     def empty?
+       pages.length == 0
+     end
+
+     def number_of_pages
+       pages.length
+     end
+
+     def field_names
+       @field_names ||= pages.map(&:field_names).flatten.uniq
+     end
+
+     def to_xml
+       add_fields
+       add_documents
+       ''.tap { |xml| document.write(xml) }
+     end
+
+     private
+
+     def next_id
+       @count += 1
+     end
+
+     def add_fields
+       field_names.each do |name|
+         add_field(name)
+       end
+     end
+
+     def add_documents
+       pages.each do |page|
+         add_document(page)
+       end
+     end
+
+     def add_field(name)
+       schema.add_element('sphinx:field').tap do |field|
+         field.add_attribute('name', name)
+       end
+     end
+
+     def add_document(page)
+       if page.field_names.length > 0
+         root.add_element('sphinx:document').tap do |doc|
+           doc.add_attribute('id', next_id)
+           doc.add_element('url').text = CData.new(page.url)
+           field_names.each do |name|
+             doc.add_element(name).text = CData.new(page.field(name))
+           end
+         end
+       end
+     end
+
+     def document
+       @document ||= Document.new.tap do |doc|
+         doc << XMLDecl.new('1.0', 'UTF-8')
+         @docset = doc.add_element('sphinx:docset')
+       end
+     end
+
+     def root
+       document.root
+     end
+
+     def schema
+       @schema ||= root.add_element('sphinx:schema').tap do |sch|
+         sch.add_element('sphinx:attr').add_attributes('name' => 'url', 'type' => 'string')
+       end
+     end
+   end
+ end
data/lib/sphinxcrawl/version.rb ADDED
@@ -0,0 +1,3 @@
+ module Sphinxcrawl
+   VERSION = '0.0.1'
+ end
data/lib/sphinxcrawl/web_crawler.rb ADDED
@@ -0,0 +1,43 @@
+ require 'uri'
+ require 'net/http'
+
+ module Sphinxcrawl
+   class WebCrawler < Crawler
+     def initialize(url, depth=0)
+       @uri = URI.parse(url)
+       super(depth)
+     end
+
+     private
+
+     def host
+       @uri.host
+     end
+
+     def port
+       @uri.port
+     end
+
+     def index_path
+       @uri.path
+     end
+
+     def http
+       Net::HTTP.new(host, port)
+     end
+
+     def index
+       @index ||= get_page(index_path)
+     end
+
+     def get_page(path)
+       path = path && !path.empty? ? path : '/'
+       location = path
+       begin
+         response = http.request(Net::HTTP::Get.new(location))
+       end while location = response.header['location']
+       html = response.body
+       Page.new(path, html) if html
+     end
+   end
+ end
data/spec/fixtures/child_file.html ADDED
@@ -0,0 +1,11 @@
+ <html>
+   <head>
+   </head>
+   <body>
+     <h1 data-field="name">My name is child</h1>
+     <p data-field="description">My description is child</p>
+     <p data-field="body"><span>My</span><span>body</span>is child</p>
+     <a href="index_file.html">index</a>
+     <a href="tree_file.html">tree</a>
+   </body>
+ </html>
data/spec/fixtures/child_web.html ADDED
@@ -0,0 +1,11 @@
+ <html>
+   <head>
+   </head>
+   <body>
+     <h1 data-field="name">My name is child</h1>
+     <p data-field="description">My description is child</p>
+     <p data-field="body"><span>My</span><span>body</span>is child</p>
+     <a href="/index_web.html">index</a>
+     <a href="/tree_web.html">tree</a>
+   </body>
+ </html>
data/spec/fixtures/index_file.html ADDED
@@ -0,0 +1,12 @@
+ <html>
+   <head>
+   </head>
+   <body>
+     <h1 data-field="name">My name</h1>
+     <p data-field="description">My description</p>
+     <p data-field="body"><span>My</span><span>body</span></p>
+     <a href="tree_file.html">tree</a>
+     <a href="index_file.html">index</a>
+     <a href="http://www.google.com">google</a>
+   </body>
+ </html>
data/spec/fixtures/index_web.html ADDED
@@ -0,0 +1,11 @@
+ <html>
+   <head>
+   </head>
+   <body>
+     <h1 data-field="name">My name</h1>
+     <p data-field="description">My description</p>
+     <p data-field="body"><span>My</span><span>body</span></p>
+     <a href="/tree_web.html">tree</a>
+     <a href="/index_web.html">index</a>
+   </body>
+ </html>
data/spec/fixtures/tree_file.html ADDED
@@ -0,0 +1,12 @@
+ <html>
+   <head>
+   </head>
+   <body>
+     <h1 data-field="name">My name is tree</h1>
+     <p data-field="description">My description is tree</p>
+     <p data-field="body"><span>My</span><span>body</span>is tree</p>
+     <a href="index_file.html">index</a>
+     <a href="child_file.html">child</a>
+     <a href="tree_file.html">tree</a>
+   </body>
+ </html>
data/spec/fixtures/tree_web.html ADDED
@@ -0,0 +1,12 @@
+ <html>
+   <head>
+   </head>
+   <body>
+     <h1 data-field="name">My name is tree</h1>
+     <p data-field="description">My description is tree</p>
+     <p data-field="body"><span>My</span><span>body</span>is tree</p>
+     <a href="/index_web.html">index</a>
+     <a href="/child_web.html">child</a>
+     <a href="/tree_web.html">tree</a>
+   </body>
+ </html>
data/spec/spec_helper.rb ADDED
@@ -0,0 +1,2 @@
+ require 'sphinxcrawl'
+ require 'webmock/rspec'
data/spec/sphinxcrawl/file_crawler_spec.rb ADDED
@@ -0,0 +1,23 @@
+ require 'spec_helper'
+
+ describe Sphinxcrawl::FileCrawler do
+   let(:dir) { 'spec/fixtures' }
+   let(:index) { 'index_file.html' }
+   let(:html) { dir + '/' + index }
+   subject { Sphinxcrawl::FileCrawler.new(html) }
+
+   its(:depth) { should be == 0 }
+   specify { subject.pages.length.should be == 1 }
+
+   context "with a depth of 1" do
+     subject { Sphinxcrawl::FileCrawler.new(html, 1) }
+     its(:depth) { should be == 1 }
+     specify { subject.pages.length.should be == 2 }
+   end
+
+   context "with a depth of 2" do
+     subject { Sphinxcrawl::FileCrawler.new(html, 2) }
+     its(:depth) { should be == 2 }
+     specify { subject.pages.length.should be == 3 }
+   end
+ end
data/spec/sphinxcrawl/page_spec.rb ADDED
@@ -0,0 +1,23 @@
+ require 'spec_helper'
+
+ describe Sphinxcrawl::Page do
+   let(:html) { File.read('spec/fixtures/index_file.html') }
+   let(:same) { Sphinxcrawl::Page.new(subject.url, 'blah') }
+   subject { Sphinxcrawl::Page.new('index_file.html', html) }
+
+   its(:html) { should be == html }
+   it { should_not be_empty }
+
+   its(:field_names) { should be == %w[name description body] }
+   specify { subject.field('name').should be == 'My name' }
+   specify { subject.field('description').should be == 'My description' }
+   specify { subject.field('body').should be == 'My body' }
+
+   its(:links) { should be == ['tree_file.html', 'index_file.html'] }
+
+   specify { subject.should be == same }
+   specify { subject.should eq(same) }
+   specify { subject.should eql(same) }
+   specify { subject.should_not equal same }
+   specify { [subject, same].uniq.length.should be == 1 }
+ end
data/spec/sphinxcrawl/stream_spec.rb ADDED
@@ -0,0 +1,20 @@
+ require 'spec_helper'
+
+ describe Sphinxcrawl::Stream do
+   let(:html) { File.read('spec/fixtures/index_file.html') }
+   let(:page) { Sphinxcrawl::Page.new('index_file.html', html) }
+   subject { Sphinxcrawl::Stream.new(page) }
+   let(:xml) { subject.to_xml }
+
+   it { should_not be_empty }
+   its(:number_of_pages) { should be == 1 }
+
+   its(:field_names) { should be == %w[name description body] }
+
+   its(:to_xml) { should include('name') }
+   its(:to_xml) { should include('description') }
+   its(:to_xml) { should include('body') }
+   its(:to_xml) { should include('My name') }
+   its(:to_xml) { should include('My description') }
+   its(:to_xml) { should include('My body') }
+ end
data/spec/sphinxcrawl/web_crawler_spec.rb ADDED
@@ -0,0 +1,32 @@
+ require 'spec_helper'
+
+ describe Sphinxcrawl::WebCrawler do
+   let(:dir) { 'spec/fixtures' }
+   let(:domain) { 'http://www.example.com' }
+   let(:index) { 'index_web.html' }
+   let(:tree) { 'tree_web.html' }
+   let(:child) { 'child_web.html' }
+
+   before do
+     stub_request(:get, domain+'/'+index).to_return(:body => File.new(dir+'/'+index), :status => 200)
+     stub_request(:get, domain+'/'+tree).to_return(:body => File.new(dir+'/'+tree), :status => 200)
+     stub_request(:get, domain+'/'+child).to_return(:body => File.new(dir+'/'+child), :status => 200)
+   end
+
+   subject { Sphinxcrawl::WebCrawler.new(domain+'/'+index) }
+
+   its(:depth) { should be == 0 }
+   specify { subject.pages.length.should be == 1 }
+
+   context "with a depth of 1" do
+     subject { Sphinxcrawl::WebCrawler.new(domain+'/'+index, 1) }
+     its(:depth) { should be == 1 }
+     specify { subject.pages.length.should be == 2 }
+   end
+
+   context "with a depth of 2" do
+     subject { Sphinxcrawl::WebCrawler.new(domain+'/'+index, 2) }
+     its(:depth) { should be == 2 }
+     specify { subject.pages.length.should be == 3 }
+   end
+ end
data/sphinxcrawl.gemspec ADDED
@@ -0,0 +1,26 @@
+ # -*- encoding: utf-8 -*-
+ lib = File.expand_path('../lib', __FILE__)
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+ require 'sphinxcrawl/version'
+
+ Gem::Specification.new do |gem|
+   gem.name          = 'sphinxcrawl'
+   gem.version       = Sphinxcrawl::VERSION
+   gem.authors       = ['Jon Doveston']
+   gem.email         = ['jon@doveston.me.uk']
+   gem.description   = %q{Simple command to crawl a site and process html into a sphinx xmlstream}
+   gem.summary       = %q{Simple command to crawl a site and process html into a sphinx xmlstream}
+   gem.homepage      = 'https://github.com/hatoishi/sphinxcrawl'
+
+   gem.files         = `git ls-files`.split($/)
+   gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
+   gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
+   gem.require_paths = ['lib']
+   gem.add_dependency('methadone', '~> 1.2.2')
+   gem.add_dependency('nokogiri', '~> 1.5.5')
+
+   gem.add_development_dependency('rdoc')
+   gem.add_development_dependency('aruba')
+   gem.add_development_dependency('rake')
+   gem.add_development_dependency('webmock')
+ end
metadata ADDED
@@ -0,0 +1,194 @@
+ --- !ruby/object:Gem::Specification
+ name: sphinxcrawl
+ version: !ruby/object:Gem::Version
+   version: 0.0.1
+   prerelease:
+ platform: ruby
+ authors:
+ - Jon Doveston
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2012-11-03 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: methadone
+   requirement: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: 1.2.2
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: 1.2.2
+ - !ruby/object:Gem::Dependency
+   name: nokogiri
+   requirement: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: 1.5.5
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: 1.5.5
+ - !ruby/object:Gem::Dependency
+   name: rdoc
+   requirement: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: aruba
+   requirement: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: rake
+   requirement: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: webmock
+   requirement: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ description: Simple command to crawl a site and process html into a sphinx xmlstream
+ email:
+ - jon@doveston.me.uk
+ executables:
+ - sphinxcrawl
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - .gitignore
+ - Gemfile
+ - Gemfile.lock
+ - Guardfile
+ - LICENSE.txt
+ - README.markdown
+ - README.rdoc
+ - Rakefile
+ - bin/sphinxcrawl
+ - example.conf
+ - features/sphinxcrawl.feature
+ - features/step_definitions/sphinxcrawl_steps.rb
+ - features/support/env.rb
+ - lib/sphinxcrawl.rb
+ - lib/sphinxcrawl/crawler.rb
+ - lib/sphinxcrawl/file_crawler.rb
+ - lib/sphinxcrawl/page.rb
+ - lib/sphinxcrawl/stream.rb
+ - lib/sphinxcrawl/version.rb
+ - lib/sphinxcrawl/web_crawler.rb
+ - spec/fixtures/child_file.html
+ - spec/fixtures/child_web.html
+ - spec/fixtures/index_file.html
+ - spec/fixtures/index_web.html
+ - spec/fixtures/tree_file.html
+ - spec/fixtures/tree_web.html
+ - spec/spec_helper.rb
+ - spec/sphinxcrawl/file_crawler_spec.rb
+ - spec/sphinxcrawl/page_spec.rb
+ - spec/sphinxcrawl/stream_spec.rb
+ - spec/sphinxcrawl/web_crawler_spec.rb
+ - sphinxcrawl.gemspec
+ homepage: https://github.com/hatoishi/sphinxcrawl
+ licenses: []
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+       segments:
+       - 0
+       hash: 3810366116509016959
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+       segments:
+       - 0
+       hash: 3810366116509016959
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 1.8.24
+ signing_key:
+ specification_version: 3
+ summary: Simple command to crawl a site and process html into a sphinx xmlstream
+ test_files:
+ - features/sphinxcrawl.feature
+ - features/step_definitions/sphinxcrawl_steps.rb
+ - features/support/env.rb
+ - spec/fixtures/child_file.html
+ - spec/fixtures/child_web.html
+ - spec/fixtures/index_file.html
+ - spec/fixtures/index_web.html
+ - spec/fixtures/tree_file.html
+ - spec/fixtures/tree_web.html
+ - spec/spec_helper.rb
+ - spec/sphinxcrawl/file_crawler_spec.rb
+ - spec/sphinxcrawl/page_spec.rb
+ - spec/sphinxcrawl/stream_spec.rb
+ - spec/sphinxcrawl/web_crawler_spec.rb