sphinxcrawl 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,7 @@
+ .DS_Store
+ results.html
+ pkg
+ html
+ .bundle
+ tags
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,17 @@
+ source 'https://rubygems.org'
+
+ # Specify your gem's dependencies in sphinxcrawl.gemspec
+ gemspec
+
+ gem 'guard'
+ gem 'guard-rspec'
+ gem 'guard-ctags-bundler'
+ gem 'guard-cucumber'
+
+ group :darwin do
+   gem 'rb-fsevent'
+ end
+
+ group :linux do
+   gem 'rb-inotify'
+ end
data/Gemfile.lock ADDED
@@ -0,0 +1,89 @@
+ PATH
+   remote: .
+   specs:
+     sphinxcrawl (0.0.1)
+       methadone (~> 1.2.2)
+       nokogiri (~> 1.5.5)
+
+ GEM
+   remote: https://rubygems.org/
+   specs:
+     addressable (2.3.2)
+     aruba (0.5.0)
+       childprocess (= 0.2.3)
+       cucumber (>= 1.1.1)
+       ffi (>= 1.0.11)
+       rspec-expectations (>= 2.7.0)
+     builder (3.1.4)
+     childprocess (0.2.3)
+       ffi (~> 1.0.6)
+     coderay (1.0.8)
+     crack (0.3.1)
+     cucumber (1.2.1)
+       builder (>= 2.1.2)
+       diff-lcs (>= 1.1.3)
+       gherkin (~> 2.11.0)
+       json (>= 1.4.6)
+     diff-lcs (1.1.3)
+     ffi (1.0.11)
+     gherkin (2.11.5)
+       json (>= 1.4.6)
+     guard (1.5.3)
+       listen (>= 0.4.2)
+       lumberjack (>= 1.0.2)
+       pry (>= 0.9.10)
+       thor (>= 0.14.6)
+     guard-ctags-bundler (0.1.6)
+       guard (>= 1.1)
+     guard-cucumber (1.2.2)
+       cucumber (>= 1.2.0)
+       guard (>= 1.1.0)
+     guard-rspec (2.1.1)
+       guard (>= 1.1)
+       rspec (~> 2.11)
+     json (1.7.5)
+     listen (0.5.3)
+     lumberjack (1.0.2)
+     methadone (1.2.2)
+       bundler
+     method_source (0.8.1)
+     nokogiri (1.5.5)
+     pry (0.9.10)
+       coderay (~> 1.0.5)
+       method_source (~> 0.8)
+       slop (~> 3.3.1)
+     rake (0.9.2.2)
+     rb-fsevent (0.9.2)
+     rb-inotify (0.8.8)
+       ffi (>= 0.5.0)
+     rdoc (3.12)
+       json (~> 1.4)
+     rspec (2.11.0)
+       rspec-core (~> 2.11.0)
+       rspec-expectations (~> 2.11.0)
+       rspec-mocks (~> 2.11.0)
+     rspec-core (2.11.1)
+     rspec-expectations (2.11.3)
+       diff-lcs (~> 1.1.3)
+     rspec-mocks (2.11.3)
+     slop (3.3.3)
+     thor (0.16.0)
+     webmock (1.8.10)
+       addressable (>= 2.2.7)
+       crack (>= 0.1.7)
+
+ PLATFORMS
+   ruby
+
+ DEPENDENCIES
+   aruba
+   guard
+   guard-ctags-bundler
+   guard-cucumber
+   guard-rspec
+   rake
+   rb-fsevent
+   rb-inotify
+   rdoc
+   sphinxcrawl!
+   webmock
data/Guardfile ADDED
@@ -0,0 +1,16 @@
+ guard 'rspec' do
+   watch(%r{^spec/.+_spec\.rb$})
+   watch(%r{^lib/(.+)\.rb$}) { |m| "spec/lib/#{m[1]}_spec.rb" }
+   watch('spec/spec_helper.rb') { 'spec' }
+ end
+
+ guard 'ctags-bundler', :src_path => ['lib', 'spec'] do
+   watch(/^(lib)\/.*\.rb$/)
+   watch('Gemfile.lock')
+ end
+
+ guard 'cucumber' do
+   watch(%r{^features/.+\.feature$})
+   watch(%r{^features/support/.+$}) { 'features' }
+   watch(%r{^features/step_definitions/(.+)_steps\.rb$}) { |m| Dir[File.join("**/#{m[1]}.feature")][0] || 'features' }
+ end
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
+ Copyright (c) 2012 Jon Doveston
+
+ MIT License
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.markdown ADDED
@@ -0,0 +1,69 @@
+ # Sphinxcrawl
+
+ A simple command-line gem to crawl a website and generate an XML stream for a
+ Sphinx search index.
+
+ ## Installation
+
+ Install it with:
+
+     $ gem install sphinxcrawl
+
+ ## Usage
+
+ After installation, the `sphinxcrawl` command should be available.
+
+ Create a `sphinx.conf` file, for example:
+
+     source page
+     {
+       type = xmlpipe2
+       xmlpipe_command = sphinxcrawl http://www.example.com -d 2 2>/dev/null
+     }
+
+     index page
+     {
+       source = page
+       path = sphinx/page
+       morphology = stem_en
+       charset_type = utf-8
+       html_strip = 1
+     }
+
+ Install Sphinx; on Debian-based Linux distributions this is:
+
+     sudo apt-get install sphinxsearch
+
+ You should now have the `indexer` and `search` commands. Build the index with
+
+     indexer -c sphinx.conf page
+
+ and you can search the index with
+
+     search -c sphinx.conf name
+
+ ## Requirements
+
+ This gem is not Google! It uses Nokogiri to parse HTML, so it will fail on
+ badly formed HTML. More importantly, it only indexes the parts of a page
+ marked with a specific HTML attribute, which means you can only index sites
+ that you control. For example:
+
+     ...
+     <div data-field="description">
+       <p>This is a description</p>
+     </div>
+     ...
+
+ will index the text inside the div tag (stripping other tags) and put the
+ text in a Sphinx field called `description`. Fields are aggregated across all
+ the pages of a site crawl.
+
+ ## Contributing
+
+ 1. Fork it
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
+ 4. Push to the branch (`git push origin my-new-feature`)
+ 5. Create a new Pull Request
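
For readers who want the same pipeline from Ruby rather than the command line, here is a minimal sketch of what `bin/sphinxcrawl` (shown later in this diff) does for a local file; the `site/index.html` path is hypothetical:

    require 'sphinxcrawl'

    # Crawl a local HTML tree two levels deep from a hypothetical index file,
    # then print the xmlpipe2 stream -- the same flow bin/sphinxcrawl follows
    # for non-http arguments.
    crawler = Sphinxcrawl::FileCrawler.new('site/index.html', 2)
    stream  = Sphinxcrawl::Stream.new(crawler.pages)
    puts stream.to_xml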
data/README.rdoc ADDED
@@ -0,0 +1,13 @@
+ = sphinxcrawl - html crawler and sphinx xmlstream generator
+
+ Author::    Jon Doveston (jon@doveston.me.uk)
+ Copyright:: Copyright (c) 2012 Jon Doveston
+
+ A command line gem that can crawl html (files or http) and generate an xml
+ stream for sphinx search.
+
+ == Links
+
+ * {Source on Github}[https://github.com/hatoishi/sphinxcrawl]
data/Rakefile ADDED
@@ -0,0 +1,66 @@
+ def dump_load_path
+   puts $LOAD_PATH.join("\n")
+   found = nil
+   $LOAD_PATH.each do |path|
+     if File.exists?(File.join(path, "rspec"))
+       puts "Found rspec in #{path}"
+       if File.exists?(File.join(path, "rspec", "core"))
+         puts "Found core"
+         if File.exists?(File.join(path, "rspec", "core", "rake_task"))
+           puts "Found rake_task"
+           found = path
+         else
+           puts "!! no rake_task"
+         end
+       else
+         puts "!!! no core"
+       end
+     end
+   end
+   if found.nil?
+     puts "Didn't find rspec/core/rake_task anywhere"
+   else
+     puts "Found in #{found}"
+   end
+ end
+
+ require 'bundler'
+ require 'rake/clean'
+
+ begin
+   require 'rspec/core/rake_task'
+ rescue LoadError
+   dump_load_path
+   raise
+ end
+
+ require 'cucumber'
+ require 'cucumber/rake/task'
+ gem 'rdoc' # we need the installed RDoc gem, not the system one
+ require 'rdoc/task'
+
+ include Rake::DSL
+
+ Bundler::GemHelper.install_tasks
+
+ RSpec::Core::RakeTask.new do |t|
+   # Put spec opts in a file named .rspec in root
+ end
+
+ CUKE_RESULTS = 'results.html'
+ CLEAN << CUKE_RESULTS
+ Cucumber::Rake::Task.new(:features) do |t|
+   t.cucumber_opts = "features --format html -o #{CUKE_RESULTS} --format pretty --no-source -x"
+   t.fork = false
+ end
+
+ Rake::RDocTask.new do |rd|
+   rd.main = "README.rdoc"
+   rd.rdoc_files.include("README.rdoc", "lib/**/*.rb", "bin/**/*")
+ end
+
+ task :default => [:spec, :features]
data/bin/sphinxcrawl ADDED
@@ -0,0 +1,35 @@
+ #!/usr/bin/env ruby
+
+ require 'optparse'
+ require 'methadone'
+ require 'sphinxcrawl'
+
+ class App
+   include Methadone::Main
+   include Methadone::CLILogging
+
+   main do |index|
+     index ||= 'index.html'
+     if index =~ /http:/
+       crawler = Sphinxcrawl::WebCrawler.new(index, options[:depth].to_i)
+     else
+       exit_now!(1, "file #{index} not found") unless File.exists?(index)
+       index = index + '/index.html' if File.directory?(index)
+       exit_now!(1, "file #{index} not found") unless File.exists?(index)
+       crawler = Sphinxcrawl::FileCrawler.new(index, options[:depth].to_i)
+     end
+     pages = crawler.pages
+     stream = Sphinxcrawl::Stream.new(pages)
+     puts stream.to_xml
+   end
+
+   description 'Generate sphinx xml stream from html pages'
+   arg :index, :optional
+
+   options['depth'] = '0'
+   on('-d depth', '--depth', 'Crawl depth', /^\d+$/)
+
+   version Sphinxcrawl::VERSION
+   use_log_level_option
+   go!
+ end
data/example.conf ADDED
@@ -0,0 +1,14 @@
+ source page
+ {
+   type = xmlpipe2
+   xmlpipe_command = bundle exec ./bin/sphinxcrawl spec/fixtures/index_file.html -d 2 2>/dev/null
+ }
+
+ index page
+ {
+   source = page
+   path = tmp/page
+   morphology = stem_en
+   charset_type = utf-8
+   html_strip = 1
+ }
data/features/sphinxcrawl.feature ADDED
@@ -0,0 +1,70 @@
+ Feature: Generation of sphinx xml
+   Scenario: help page
+     When I get help for "sphinxcrawl"
+     Then the exit status should be 0
+     And the banner should be present
+     And the banner should document that this app takes options
+     And the following options should be documented:
+       |--version|
+
+   Scenario: xml for a nonexistent file
+     When I run `sphinxcrawl spec/fixtures/blah.html`
+     Then the exit status should be 1
+
+   Scenario: xml for a nonexistent index
+     When I run `sphinxcrawl spec`
+     Then the exit status should be 1
+
+   Scenario: xml for a single file
+     Given a file named "test.html" with:
+       """
+       <html>
+         <head>
+         </head>
+         <body>
+           <h1 data-field="name">My name</h1>
+           <p data-field="description">My description</p>
+           <p data-field="body"><span>My</span><span>body</span></p>
+         </body>
+       </html>
+       """
+     When I run `sphinxcrawl test.html`
+     Then the exit status should be 0
+     And the output should contain "name"
+     And the output should contain "description"
+     And the output should contain "body"
+
+   Scenario: index the xml
+     Given a file named "test.html" with:
+       """
+       <html>
+         <head>
+         </head>
+         <body>
+           <h1 data-field="name">My name</h1>
+           <p data-field="description">My description</p>
+           <p data-field="body"><span>My</span><span>body</span></p>
+         </body>
+       </html>
+       """
+     And a file named "test.conf" with:
+       """
+       indexer
+       {
+       }
+       source page
+       {
+         type = xmlpipe2
+         xmlpipe_command = ../../bin/sphinxcrawl test.html 2>/dev/null
+       }
+       index page
+       {
+         source = page
+         path = page
+         morphology = stem_en
+         charset_type = utf-8
+         html_strip = 1
+       }
+       """
+     When I run `indexer -c test.conf page`
+     Then the exit status should be 0
data/features/step_definitions/sphinxcrawl_steps.rb ADDED
@@ -0,0 +1 @@
+ # Put your step definitions here
data/features/support/env.rb ADDED
@@ -0,0 +1,16 @@
+ require 'aruba/cucumber'
+ require 'methadone/cucumber'
+
+ ENV['PATH'] = "#{File.expand_path(File.dirname(__FILE__) + '/../../bin')}#{File::PATH_SEPARATOR}#{ENV['PATH']}"
+ LIB_DIR = File.join(File.expand_path(File.dirname(__FILE__)), '..', '..', 'lib')
+
+ Before do
+   # Using "announce" causes massive warnings on 1.9.2
+   @puts = true
+   @original_rubylib = ENV['RUBYLIB']
+   ENV['RUBYLIB'] = LIB_DIR + File::PATH_SEPARATOR + ENV['RUBYLIB'].to_s
+ end
+
+ After do
+   ENV['RUBYLIB'] = @original_rubylib
+ end
data/lib/sphinxcrawl.rb ADDED
@@ -0,0 +1,9 @@
+ require 'sphinxcrawl/version'
+ require 'sphinxcrawl/page'
+ require 'sphinxcrawl/stream'
+ require 'sphinxcrawl/crawler'
+ require 'sphinxcrawl/file_crawler'
+ require 'sphinxcrawl/web_crawler'
+
+ module Sphinxcrawl
+ end
data/lib/sphinxcrawl/crawler.rb ADDED
@@ -0,0 +1,36 @@
+ require 'set'
+
+ module Sphinxcrawl
+   class Crawler
+     attr_reader :depth
+
+     def initialize(depth=0)
+       @depth = depth
+     end
+
+     def pages
+       return @pages if @pages
+       return [] unless index
+       @pages = Set.new([index])
+       return @pages if @depth == 0
+
+       current_pages = Set.new([index])
+       depth.times do
+         links = current_pages.map(&:links).flatten.compact.uniq
+         current_pages = Set.new(links.map { |url| get_page(url) }.compact) - @pages
+         @pages += current_pages
+       end
+       @pages
+     end
+
+     private
+
+     def index
+       nil
+     end
+
+     def get_page(url)
+       nil
+     end
+   end
+ end
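
`Crawler#pages` is a breadth-first expansion: each round collects the `links` of the current frontier, fetches them with `get_page`, and drops already-seen pages (the `Set` deduplicates through `Page#hash`/`#eql?`). Subclasses only supply `index` and `get_page`. A minimal illustrative subclass, with a hypothetical in-memory `SITE` hash standing in for a filesystem or HTTP server:

    require 'sphinxcrawl'

    class HashCrawler < Sphinxcrawl::Crawler
      # Hypothetical two-page "site" kept in memory.
      SITE = {
        'index.html' => '<a href="a.html">a</a>',
        'a.html'     => '<p data-field="body">hello</p>',
      }

      private

      # The page the crawl starts from.
      def index
        @index ||= get_page('index.html')
      end

      # Build a Page when the "site" has the url, nil otherwise.
      def get_page(url)
        html = SITE[url]
        Sphinxcrawl::Page.new(url, html) if html
      end
    end

    HashCrawler.new(1).pages.length # => 2 (index.html plus a.html)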
data/lib/sphinxcrawl/file_crawler.rb ADDED
@@ -0,0 +1,20 @@
+ module Sphinxcrawl
+   class FileCrawler < Crawler
+     def initialize(index_file_name, depth=0)
+       @directory = File.dirname(index_file_name)
+       @basename = File.basename(index_file_name)
+       super(depth)
+     end
+
+     private
+
+     def index
+       @index ||= get_page(@basename)
+     end
+
+     def get_page(url)
+       html = File.read(@directory + '/' + url) rescue nil
+       Page.new(url, html) if html
+     end
+   end
+ end
data/lib/sphinxcrawl/page.rb ADDED
@@ -0,0 +1,58 @@
+ require 'uri'
+ require 'nokogiri'
+
+ module Sphinxcrawl
+   class Page
+     attr_reader :html, :url
+
+     def initialize(url, html)
+       @url = url
+       @html = html
+       @field_data = {}
+     end
+
+     def empty?
+       !html || html.empty?
+     end
+
+     def field_names
+       fields.map do |n|
+         n.attribute('data-field').value
+       end.uniq
+     end
+
+     def field(name)
+       @field_data[name] ||= get_field(name)
+     end
+
+     def links
+       @links ||= document.xpath('//a/@href').map(&:value).map do |url|
+         # only local links (no http://)
+         uri = URI.parse(url)
+         uri.host.nil? && uri.scheme.nil? ? url : nil
+       end.compact
+     end
+
+     def eql?(compare)
+       url == compare.url
+     end
+     alias :== :eql?
+
+     def hash
+       url.hash
+     end
+
+     private
+
+     def get_field(name)
+       document.xpath("//*[@data-field='#{name}']//text()").map(&:content).join(' ')
+     end
+
+     def fields
+       @fields ||= document.xpath('//*[@data-field]')
+     end
+
+     def document
+       @document ||= Nokogiri::HTML(html)
+     end
+   end
+ end
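
To make the `Page` API concrete, a short sketch of the calls the specs later in this diff exercise; the inline HTML is a trimmed stand-in for `spec/fixtures/index_file.html`:

    require 'sphinxcrawl'

    html = <<-HTML
      <h1 data-field="name">My name</h1>
      <p data-field="body"><span>My</span><span>body</span></p>
      <a href="tree_file.html">tree</a>
      <a href="http://www.google.com">google</a>
    HTML

    page = Sphinxcrawl::Page.new('index_file.html', html)
    page.field_names   # => ["name", "body"]
    page.field('body') # => "My body" (descendant text nodes joined by spaces)
    page.links         # => ["tree_file.html"] -- links with a host or scheme are dropped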
data/lib/sphinxcrawl/stream.rb ADDED
@@ -0,0 +1,85 @@
+ require 'rexml/document'
+
+ module Sphinxcrawl
+   class Stream
+     include REXML
+
+     attr_reader :pages
+
+     def initialize(pages)
+       @pages = Array(pages)
+       @count = 0
+     end
+
+     def empty?
+       pages.length == 0
+     end
+
+     def number_of_pages
+       pages.length
+     end
+
+     def field_names
+       @field_names ||= pages.map(&:field_names).flatten.uniq
+     end
+
+     def to_xml
+       add_fields
+       add_documents
+       ''.tap { |xml| document.write(xml) }
+     end
+
+     private
+
+     def next_id
+       @count += 1
+     end
+
+     def add_fields
+       field_names.each do |name|
+         add_field(name)
+       end
+     end
+
+     def add_documents
+       pages.each do |page|
+         add_document(page)
+       end
+     end
+
+     def add_field(name)
+       schema.add_element('sphinx:field').tap do |field|
+         field.add_attribute('name', name)
+       end
+     end
+
+     def add_document(page)
+       if page.field_names.length > 0
+         root.add_element('sphinx:document').tap do |doc|
+           doc.add_attribute('id', next_id)
+           doc.add_element('url').text = CData.new(page.url)
+           field_names.each do |name|
+             doc.add_element(name).text = CData.new(page.field(name))
+           end
+         end
+       end
+     end
+
+     def document
+       @document ||= Document.new.tap do |doc|
+         doc << XMLDecl.new('1.0', 'UTF-8')
+         @docset = doc.add_element('sphinx:docset')
+       end
+     end
+
+     def root
+       document.root
+     end
+
+     def schema
+       @schema ||= root.add_element('sphinx:schema').tap do |sch|
+         sch.add_element('sphinx:attr').add_attributes('name' => 'url', 'type' => 'string')
+       end
+     end
+   end
+ end
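
`Stream#to_xml` declares one `sphinx:field` per distinct field name in the schema, then emits each page with fields as a `sphinx:document` holding CDATA-wrapped values. A minimal sketch of the output shape (whitespace and quoting approximate):

    require 'sphinxcrawl'

    page   = Sphinxcrawl::Page.new('a.html', '<h1 data-field="name">Hi</h1>')
    stream = Sphinxcrawl::Stream.new(page) # a single page is wrapped via Array()
    puts stream.to_xml
    # Roughly:
    #   <?xml version='1.0' encoding='UTF-8'?>
    #   <sphinx:docset>
    #     <sphinx:schema>
    #       <sphinx:attr name='url' type='string'/>
    #       <sphinx:field name='name'/>
    #     </sphinx:schema>
    #     <sphinx:document id='1'>
    #       <url><![CDATA[a.html]]></url>
    #       <name><![CDATA[Hi]]></name>
    #     </sphinx:document>
    #   </sphinx:docset>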
data/lib/sphinxcrawl/version.rb ADDED
@@ -0,0 +1,3 @@
+ module Sphinxcrawl
+   VERSION = '0.0.1'
+ end
data/lib/sphinxcrawl/web_crawler.rb ADDED
@@ -0,0 +1,43 @@
+ require 'uri'
+ require 'net/http'
+
+ module Sphinxcrawl
+   class WebCrawler < Crawler
+     def initialize(url, depth=0)
+       @uri = URI.parse(url)
+       super(depth)
+     end
+
+     private
+
+     def host
+       @uri.host
+     end
+
+     def port
+       @uri.port
+     end
+
+     def index_path
+       @uri.path
+     end
+
+     def http
+       Net::HTTP.new(host, port)
+     end
+
+     def index
+       @index ||= get_page(index_path)
+     end
+
+     def get_page(path)
+       path = path && !path.empty? ? path : '/'
+       location = path
+       begin
+         response = http.request(Net::HTTP::Get.new(location))
+       end while location = response.header['location']
+       html = response.body
+       Page.new(path, html) if html
+     end
+   end
+ end
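
The `begin ... end while` in `get_page` is a redirect-follower: it keeps issuing GETs as long as the response carries a `Location` header, which implicitly assumes redirect targets stay on the same host and port. The same pattern in isolation, against a hypothetical path on example.com:

    require 'net/http'

    http = Net::HTTP.new('www.example.com', 80)
    location = '/old-path' # hypothetical path that may redirect
    begin
      # The body runs at least once; each pass requests the latest target.
      response = http.request(Net::HTTP::Get.new(location))
    end while location = response.header['location'] # nil ends the loop
    puts response.body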
data/spec/fixtures/child_file.html ADDED
@@ -0,0 +1,11 @@
+ <html>
+   <head>
+   </head>
+   <body>
+     <h1 data-field="name">My name is child</h1>
+     <p data-field="description">My description is child</p>
+     <p data-field="body"><span>My</span><span>body</span>is child</p>
+     <a href="index_file.html">index</a>
+     <a href="tree_file.html">tree</a>
+   </body>
+ </html>
data/spec/fixtures/child_web.html ADDED
@@ -0,0 +1,11 @@
+ <html>
+   <head>
+   </head>
+   <body>
+     <h1 data-field="name">My name is child</h1>
+     <p data-field="description">My description is child</p>
+     <p data-field="body"><span>My</span><span>body</span>is child</p>
+     <a href="/index_web.html">index</a>
+     <a href="/tree_web.html">tree</a>
+   </body>
+ </html>
data/spec/fixtures/index_file.html ADDED
@@ -0,0 +1,12 @@
+ <html>
+   <head>
+   </head>
+   <body>
+     <h1 data-field="name">My name</h1>
+     <p data-field="description">My description</p>
+     <p data-field="body"><span>My</span><span>body</span></p>
+     <a href="tree_file.html">tree</a>
+     <a href="index_file.html">index</a>
+     <a href="http://www.google.com">google</a>
+   </body>
+ </html>
data/spec/fixtures/index_web.html ADDED
@@ -0,0 +1,11 @@
+ <html>
+   <head>
+   </head>
+   <body>
+     <h1 data-field="name">My name</h1>
+     <p data-field="description">My description</p>
+     <p data-field="body"><span>My</span><span>body</span></p>
+     <a href="/tree_web.html">tree</a>
+     <a href="/index_web.html">index</a>
+   </body>
+ </html>
data/spec/fixtures/tree_file.html ADDED
@@ -0,0 +1,12 @@
+ <html>
+   <head>
+   </head>
+   <body>
+     <h1 data-field="name">My name is tree</h1>
+     <p data-field="description">My description is tree</p>
+     <p data-field="body"><span>My</span><span>body</span>is tree</p>
+     <a href="index_file.html">index</a>
+     <a href="child_file.html">child</a>
+     <a href="tree_file.html">tree</a>
+   </body>
+ </html>
data/spec/fixtures/tree_web.html ADDED
@@ -0,0 +1,12 @@
+ <html>
+   <head>
+   </head>
+   <body>
+     <h1 data-field="name">My name is tree</h1>
+     <p data-field="description">My description is tree</p>
+     <p data-field="body"><span>My</span><span>body</span>is tree</p>
+     <a href="/index_web.html">index</a>
+     <a href="/child_web.html">child</a>
+     <a href="/tree_web.html">tree</a>
+   </body>
+ </html>
data/spec/spec_helper.rb ADDED
@@ -0,0 +1,2 @@
+ require 'sphinxcrawl'
+ require 'webmock/rspec'
data/spec/sphinxcrawl/file_crawler_spec.rb ADDED
@@ -0,0 +1,23 @@
+ require 'spec_helper'
+
+ describe Sphinxcrawl::FileCrawler do
+   let(:dir) { 'spec/fixtures' }
+   let(:index) { 'index_file.html' }
+   let(:html) { dir + '/' + index }
+   subject { Sphinxcrawl::FileCrawler.new(html) }
+
+   its(:depth) { should be == 0 }
+   specify { subject.pages.length.should be == 1 }
+
+   context "with a depth of 1" do
+     subject { Sphinxcrawl::FileCrawler.new(html, 1) }
+     its(:depth) { should be == 1 }
+     specify { subject.pages.length.should be == 2 }
+   end
+
+   context "with a depth of 2" do
+     subject { Sphinxcrawl::FileCrawler.new(html, 2) }
+     its(:depth) { should be == 2 }
+     specify { subject.pages.length.should be == 3 }
+   end
+ end
data/spec/sphinxcrawl/page_spec.rb ADDED
@@ -0,0 +1,23 @@
+ require 'spec_helper'
+
+ describe Sphinxcrawl::Page do
+   let(:html) { File.read('spec/fixtures/index_file.html') }
+   let(:same) { Sphinxcrawl::Page.new(subject.url, 'blah') }
+   subject { Sphinxcrawl::Page.new('index_file.html', html) }
+
+   its(:html) { should be == html }
+   it { should_not be_empty }
+
+   its(:field_names) { should be == %w[name description body] }
+   specify { subject.field('name').should be == 'My name' }
+   specify { subject.field('description').should be == 'My description' }
+   specify { subject.field('body').should be == 'My body' }
+
+   its(:links) { should be == ['tree_file.html', 'index_file.html'] }
+
+   specify { subject.should be == same }
+   specify { subject.should eq(same) }
+   specify { subject.should eql(same) }
+   specify { subject.should_not equal same }
+   specify { [subject, same].uniq.length.should be == 1 }
+ end
data/spec/sphinxcrawl/stream_spec.rb ADDED
@@ -0,0 +1,20 @@
+ require 'spec_helper'
+
+ describe Sphinxcrawl::Stream do
+   let(:html) { File.read('spec/fixtures/index_file.html') }
+   let(:page) { Sphinxcrawl::Page.new('index_file.html', html) }
+   subject { Sphinxcrawl::Stream.new(page) }
+   let(:xml) { subject.to_xml }
+
+   it { should_not be_empty }
+   its(:number_of_pages) { should be == 1 }
+
+   its(:field_names) { should be == %w[name description body] }
+
+   its(:to_xml) { should include('name') }
+   its(:to_xml) { should include('description') }
+   its(:to_xml) { should include('body') }
+   its(:to_xml) { should include('My name') }
+   its(:to_xml) { should include('My description') }
+   its(:to_xml) { should include('My body') }
+ end
data/spec/sphinxcrawl/web_crawler_spec.rb ADDED
@@ -0,0 +1,32 @@
+ require 'spec_helper'
+
+ describe Sphinxcrawl::WebCrawler do
+   let(:dir) { 'spec/fixtures' }
+   let(:domain) { 'http://www.example.com' }
+   let(:index) { 'index_web.html' }
+   let(:tree) { 'tree_web.html' }
+   let(:child) { 'child_web.html' }
+
+   before do
+     stub_request(:get, domain + '/' + index).to_return(:body => File.new(dir + '/' + index), :status => 200)
+     stub_request(:get, domain + '/' + tree).to_return(:body => File.new(dir + '/' + tree), :status => 200)
+     stub_request(:get, domain + '/' + child).to_return(:body => File.new(dir + '/' + child), :status => 200)
+   end
+
+   subject { Sphinxcrawl::WebCrawler.new(domain + '/' + index) }
+
+   its(:depth) { should be == 0 }
+   specify { subject.pages.length.should be == 1 }
+
+   context "with a depth of 1" do
+     subject { Sphinxcrawl::WebCrawler.new(domain + '/' + index, 1) }
+     its(:depth) { should be == 1 }
+     specify { subject.pages.length.should be == 2 }
+   end
+
+   context "with a depth of 2" do
+     subject { Sphinxcrawl::WebCrawler.new(domain + '/' + index, 2) }
+     its(:depth) { should be == 2 }
+     specify { subject.pages.length.should be == 3 }
+   end
+ end
data/sphinxcrawl.gemspec ADDED
@@ -0,0 +1,26 @@
+ # -*- encoding: utf-8 -*-
+ lib = File.expand_path('../lib', __FILE__)
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+ require 'sphinxcrawl/version'
+
+ Gem::Specification.new do |gem|
+   gem.name          = 'sphinxcrawl'
+   gem.version       = Sphinxcrawl::VERSION
+   gem.authors       = ['Jon Doveston']
+   gem.email         = ['jon@doveston.me.uk']
+   gem.description   = %q{Simple command to crawl a site and process html into a sphinx xmlstream}
+   gem.summary       = %q{Simple command to crawl a site and process html into a sphinx xmlstream}
+   gem.homepage      = 'https://github.com/hatoishi/sphinxcrawl'
+
+   gem.files         = `git ls-files`.split($/)
+   gem.executables   = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
+   gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
+   gem.require_paths = ['lib']
+   gem.add_dependency('methadone', '~> 1.2.2')
+   gem.add_dependency('nokogiri', '~> 1.5.5')
+
+   gem.add_development_dependency('rdoc')
+   gem.add_development_dependency('aruba')
+   gem.add_development_dependency('rake')
+   gem.add_development_dependency('webmock')
+ end
metadata ADDED
@@ -0,0 +1,194 @@
+ --- !ruby/object:Gem::Specification
+ name: sphinxcrawl
+ version: !ruby/object:Gem::Version
+   version: 0.0.1
+ prerelease:
+ platform: ruby
+ authors:
+ - Jon Doveston
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2012-11-03 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: methadone
+   requirement: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: 1.2.2
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: 1.2.2
+ - !ruby/object:Gem::Dependency
+   name: nokogiri
+   requirement: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: 1.5.5
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: 1.5.5
+ - !ruby/object:Gem::Dependency
+   name: rdoc
+   requirement: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: aruba
+   requirement: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: rake
+   requirement: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: webmock
+   requirement: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ description: Simple command to crawl a site and process html into a sphinx xmlstream
+ email:
+ - jon@doveston.me.uk
+ executables:
+ - sphinxcrawl
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - .gitignore
+ - Gemfile
+ - Gemfile.lock
+ - Guardfile
+ - LICENSE.txt
+ - README.markdown
+ - README.rdoc
+ - Rakefile
+ - bin/sphinxcrawl
+ - example.conf
+ - features/sphinxcrawl.feature
+ - features/step_definitions/sphinxcrawl_steps.rb
+ - features/support/env.rb
+ - lib/sphinxcrawl.rb
+ - lib/sphinxcrawl/crawler.rb
+ - lib/sphinxcrawl/file_crawler.rb
+ - lib/sphinxcrawl/page.rb
+ - lib/sphinxcrawl/stream.rb
+ - lib/sphinxcrawl/version.rb
+ - lib/sphinxcrawl/web_crawler.rb
+ - spec/fixtures/child_file.html
+ - spec/fixtures/child_web.html
+ - spec/fixtures/index_file.html
+ - spec/fixtures/index_web.html
+ - spec/fixtures/tree_file.html
+ - spec/fixtures/tree_web.html
+ - spec/spec_helper.rb
+ - spec/sphinxcrawl/file_crawler_spec.rb
+ - spec/sphinxcrawl/page_spec.rb
+ - spec/sphinxcrawl/stream_spec.rb
+ - spec/sphinxcrawl/web_crawler_spec.rb
+ - sphinxcrawl.gemspec
+ homepage: https://github.com/hatoishi/sphinxcrawl
+ licenses: []
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+       segments:
+       - 0
+       hash: 3810366116509016959
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+       segments:
+       - 0
+       hash: 3810366116509016959
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 1.8.24
+ signing_key:
+ specification_version: 3
+ summary: Simple command to crawl a site and process html into a sphinx xmlstream
+ test_files:
+ - features/sphinxcrawl.feature
+ - features/step_definitions/sphinxcrawl_steps.rb
+ - features/support/env.rb
+ - spec/fixtures/child_file.html
+ - spec/fixtures/child_web.html
+ - spec/fixtures/index_file.html
+ - spec/fixtures/index_web.html
+ - spec/fixtures/tree_file.html
+ - spec/fixtures/tree_web.html
+ - spec/spec_helper.rb
+ - spec/sphinxcrawl/file_crawler_spec.rb
+ - spec/sphinxcrawl/page_spec.rb
+ - spec/sphinxcrawl/stream_spec.rb
+ - spec/sphinxcrawl/web_crawler_spec.rb