web_crawler 0.3.1 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +2 -0
- data/README +22 -1
- data/lib/web_crawler.rb +2 -0
- data/lib/web_crawler/application.rb +33 -2
- data/lib/web_crawler/base.rb +113 -0
- data/lib/web_crawler/batch_request.rb +10 -4
- data/lib/web_crawler/cached_request.rb +16 -7
- data/lib/web_crawler/configuration.rb +5 -5
- data/lib/web_crawler/factory_url.rb +27 -7
- data/lib/web_crawler/follower.rb +11 -9
- data/lib/web_crawler/parsers.rb +1 -0
- data/lib/web_crawler/parsers/mapper.rb +114 -0
- data/lib/web_crawler/parsers/url.rb +3 -5
- data/lib/web_crawler/request.rb +14 -2
- data/lib/web_crawler/response.rb +2 -2
- data/lib/web_crawler/version.rb +2 -2
- data/lib/web_crawler/view.rb +1 -1
- data/lib/web_crawler/view/csv.rb +1 -1
- data/lib/web_crawler/view/json.rb +1 -1
- data/lib/web_crawler/view/yaml.rb +1 -1
- data/spec/fixtures/example.xml +171 -0
- data/spec/fixtures/my_crawler.rb +82 -0
- data/spec/fixtures/test_crawler.rb +108 -0
- data/spec/fixtures/test_crawler2.rb +77 -0
- data/spec/spec_helper.rb +8 -3
- data/spec/web_crawler/batch_request_spec.rb +0 -11
- data/spec/web_crawler/cached_request_spec.rb +17 -11
- data/spec/web_crawler/factory_url_spec.rb +19 -6
- data/spec/web_crawler/follow_spec.rb +11 -4
- data/spec/web_crawler/view_spec.rb +10 -10
- data/spec/web_crawler/web_crawler_api_base_class_spec.rb +143 -0
- data/web_crawler.gemspec +2 -0
- metadata +43 -8
@@ -0,0 +1,108 @@
|
|
1
|
+
#coding: utf-8
|
2
|
+
|
3
|
+
# Fixture crawler: scrapes vacancy pages and converts every vacancy table
# into a Hash with symbolic field names.
class TestCrawler < WebCrawler::Base

  # Row labels as they appear on the page => field names used in the
  # normalized records. Built once instead of on every lookup.
  KEY_TRANSLATIONS = {
    "Город"                => :city_name,
    "Фирма"                => :company_name,
    "График работы"        => :schedule,
    "Тип работы"           => :employment_type,
    "Зарплата, руб."       => :profit,
    "Должность"            => :name,
    "Условия"              => :conditions,
    "Требования"           => :requirements,
    "Обязанности"          => :responsibilities,
    "О компании"           => :company_description,
    "Знание языков"        => :known_languages,
    "Знание компьютера"    => :known_computer,
    "Образование"          => :education,
    "Место работы (район)" => :place_of_work,
    "Бизнес-образование"   => :business_education,
    "Телефон"              => :phone,
    "Контактное лицо"      => :contact_name,
    "E-mail"               => :email,
    "Адрес"                => :address,
    "Стаж"                 => :experience,
    "http://"              => :site,
    "url"                  => :url
  }.freeze

  # Start at the given URL and follow only links that match
  # /job/vacancy/<digits>.html.
  target 'http://45.ru/job/vacancy/2.php' do |targets|
    follow targets, :only => /\/job\/vacancy\/\d+\.html/
  end

  cache_to '/tmp/wcrawler_cache'

  log_to nil

  # Produces [label, value] pairs under the :vacancies key: every
  # `td.bg_color2` cell supplies the label, and whatever `search('~')`
  # returns for that cell supplies the value (presumably the sibling
  # cell -- confirm against the parser's selector support).
  context '#block_center > table:first', :vacancies do |table|
    table.search('td.bg_color2').map do |key|
      [key.inner_text.strip, key.search('~').inner_text.strip]
    end
  end

  # NOTE: the original file had a bare `protected` here. It has no effect on
  # methods defined inside `class << self`, so the methods below are (and
  # always were) public.
  class << self

    # Runs the inherited processing and normalizes its result.
    #
    # @return [Array<Hash>] one Hash per vacancy, see #normalize_data
    def process(*)
      normalize_data(super)
    end

    # Converts the raw [label, value] pairs collected under :vacancies into
    # Hashes keyed by the translated field name.
    #
    # Labels without a translation are skipped. Previously they were all
    # stored under the single key `nil`, so each unknown row silently
    # overwrote the previous one.
    #
    # @param data [Hash] must respond to `[:vacancies]` with a list of
    #   vacancies, each a list of [label, value] pairs
    # @return [Array<Hash{Symbol => String}>]
    def normalize_data(data)
      data[:vacancies].map do |vacancy|
        vacancy.each_with_object({}) do |(label, value), record|
          field = translate_key(label)
          next unless field

          record[field] = normalize_value(value)
        end
      end
    end

    # @param key [String] row label as found on the page
    # @return [Symbol, nil] the field name, or nil when the label is unknown
    def translate_key(key)
      KEY_TRANSLATIONS[key]
    end

    # Collapses every run of tab characters into a single space.
    #
    # @param value [String]
    # @return [String]
    def normalize_value(value)
      value.gsub(/\t+/, ' ')
    end

  end

end
|
64
|
+
|
65
|
+
__END__
|
66
|
+
|
67
|
+
|
68
|
+
|
69
|
+
Город Курган
|
70
|
+
Фирма Розничная сеть Л`Этуаль
|
71
|
+
Должность Продавец-консультант ЛЭтуаль
|
72
|
+
Зарплата, руб. 20 000
|
73
|
+
Форма оплаты Оклад+%
|
74
|
+
График работы Полный рабочий день
|
75
|
+
Тип работы Постоянная
|
76
|
+
Условия Условия работы:
|
77
|
+
• Корпоративное обучение
|
78
|
+
• График работы сменный;
|
79
|
+
• Оформление по Трудовому Кодексу РФ, соц. Пакет
|
80
|
+
• Конкурентная заработная плата, оклад + %
|
81
|
+
|
82
|
+
В компании разработана уникальная программа карьерного роста для наших сотрудников!
|
83
|
+
Требования Требования:
|
84
|
+
• Образование средне-специальное, высшее
|
85
|
+
• Опыт продаж от 1 года,
|
86
|
+
• Возраст от 23 до 35 лет, приятный внешний вид
|
87
|
+
• Умение находить контакт с любым покупателем
|
88
|
+
• Готовность изменить свой имидж в соответствии с корпоративными требованиями компании
|
89
|
+
• Высокая работоспособность, активная жизненная позиция
|
90
|
+
• Стрессоустойчивость, хорошая память, желание работать и развиваться
|
91
|
+
Обязанности Обязанности:
|
92
|
+
• Грамотное консультирование клиента по ассортименту
|
93
|
+
• Продажа косметики и парфюмерии
|
94
|
+
• Работа с кассой
|
95
|
+
• Мерчендайзинг
|
96
|
+
• Соблюдение и поддержание стандартов компании
|
97
|
+
• Поддержание чистоты рабочего места
|
98
|
+
О компании Продавец-консультант - это лицо нашей компании. От качества работы, выполняемой им, зависит общий успех – его и компании. Мы декларируем как основные преимущества нашей компании – уникальный дизайн, богатый ассортимент продукции, гибкую систему скидок и специальные предложения, а самое главное – грамотность, профессионализм и вежливость продавцов-консультантов наших магазинов!
|
99
|
+
|
100
|
+
ОБЯЗАТЕЛЬНО УКАЗЫВАЙТЕ в теме письма "Продавец-консультант г Курган"
|
101
|
+
Образование Среднее специальное
|
102
|
+
Стаж 1
|
103
|
+
Степень ограничения трудоспособности Отсутствует
|
104
|
+
Телефон 8-982-602-8331
|
105
|
+
Контактное лицо Татьяна Александровна
|
106
|
+
E-mail hrm1-svx-ur@letuin.ru
|
107
|
+
http:// www.letoile.ru%2F
|
108
|
+
Документы для скачивания
|
@@ -0,0 +1,77 @@
|
|
1
|
+
#encoding: utf-8
|
2
|
+
|
3
|
+
# Fixture crawler for an XML vacancy export: every `job` element is mapped
# onto a flat record collected under the :jobs key.
class TestCrawler2 < WebCrawler::Base

  # Section headings (without line terminator) that format_description
  # wraps in <h4> tags.
  DESCRIPTION_TITLES = [
    "Условия работы и компенсации:",
    "Место работы:",
    "Должностные обязанности:",
    "Требования к квалификации:"
  ].freeze

  # Value of a catalog item's `thread` attribute => id (or list of ids)
  # stored in :specialization_ids. Presumably remote catalogue ids mapped to
  # local specialization ids -- confirm with the data owner.
  SPECIALIZATION_IDS = {
    911  => 4537,
    1    => 4274,
    5    => 4335,
    6    => 4408,
    16   => [4756, 4545],
    3    => 4488,
    9    => 4303,
    8    => 4649,
    547  => 4237,
    579  => 4237,
    1104 => 4671,
    10   => 4588,
    814  => 4568,
    2    => 4714,
    11   => 4671,
    13   => 4691,
    15   => 4649,
    17   => 4504,
    601  => 4428,
    45   => 4632,
    22   => 4473,
    515  => 4524,
    19   => 4473,
    20   => 4524,
    398  => 4749,
    503  => 4775,
    941  => 4742,
    1434 => 4802,
    2109 => 4537
  }.freeze

  target "http://www.superjob.ru/export/vacs_to_xml.php"

  log_to "/tmp/file.log" # or Logger.new(...)

  cache_to '/tmp/wcrawler/cache' # or (CacheClass < CacheAdapter).new *args

  context "job", :jobs do

    map 'link', :to => :source_link, :on => :inner_text # default :on => :inner_text
    map 'name', :to => :name
    map 'region', :to => :city_name
    map 'salary', :to => :profit
    map 'description', :to => :description, :filter => :format_description
    map 'contacts', :to => :contact_text
    map 'company', :to => :company, :on => [:attr, :id]
    map 'published', :to => :published_at
    map 'expire', :to => :expire_at
    map 'catalog item', :to => :specialization_ids, :on => nil, :filter => :convert_specs

  end

  # The two class methods below are the filters referenced by name in the
  # `map` calls above. The original file put a bare `protected` before them;
  # it does not apply to `def self.` methods, so they are public.

  # Wraps every known section heading of a description in <h4> tags and
  # leaves all other lines untouched.
  #
  # A heading is recognized by its text regardless of the line terminator,
  # so headings followed by "\r\n" or sitting on the last line without a
  # trailing newline are handled too (the previous exact match against
  # "...:\n" missed both).
  #
  # @param text [String] raw description
  # @return [String] description with headings marked up
  def self.format_description(text)
    text.each_line.map do |line|
      heading = line.chomp
      DESCRIPTION_TITLES.include?(heading) ? "<h4>#{heading}</h4>\n" : line
    end.join
  end

  # Translates catalog items into a flat list of specialization ids.
  #
  # Items whose `thread` attribute has no entry in SPECIALIZATION_IDS are
  # skipped; previously each of them put a `nil` into the result.
  #
  # @param specs [Enumerable] items responding to `['thread']`
  # @return [Array<Integer>]
  def self.convert_specs(specs)
    specs.flat_map { |spec| Array(SPECIALIZATION_IDS[spec['thread'].to_i]) }
  end

end
|
73
|
+
|
74
|
+
|
75
|
+
#MyCrawler.run # => returns the parsed data as a Ruby object (a Hash keyed by context name, per the Base spec)
|
76
|
+
#MyCrawler.run(:json) # => returns a String containing JSON
|
77
|
+
#MyCrawler.run(:yaml) # => returns a String containing YAML
|
data/spec/spec_helper.rb
CHANGED
@@ -9,9 +9,14 @@ require 'fake_web_generator'
|
|
9
9
|
RSpec.configure do |c|
|
10
10
|
c.mock_with :rspec
|
11
11
|
c.include FakeWebGenerator
|
12
|
-
end
|
13
12
|
|
14
|
-
|
15
|
-
|
13
|
+
c.before(:each) do
|
14
|
+
WebCrawler.configure do
|
15
|
+
config.logger = nil
|
16
|
+
config.cache_adapter = WebCrawler::CacheAdapter::Memory.new
|
17
|
+
config.logger.level = Logger::ERROR
|
18
|
+
end
|
19
|
+
end
|
16
20
|
end
|
17
21
|
|
22
|
+
|
@@ -27,19 +27,8 @@ describe WebCrawler::BatchRequest do
|
|
27
27
|
end
|
28
28
|
|
29
29
|
it "should process requests" do
|
30
|
-
subject.requests.map { |r| r.should_receive(:process).with(no_args).and_return(responses.first) }
|
31
30
|
subject.process.should be_a Array
|
32
31
|
subject.process.first.should be_a WebCrawler::Response
|
33
32
|
end
|
34
33
|
|
35
|
-
it "should accept :parser option with parser class or object" do
|
36
|
-
class ::TestParser
|
37
|
-
def parse(resp)
|
38
|
-
resp.to_s + ' parsed'
|
39
|
-
end
|
40
|
-
end
|
41
|
-
described_class.new(urls, parser: TestParser.new).process.should == ["Example body parsed",
|
42
|
-
"Example body1 parsed",
|
43
|
-
"Example body for url http://example.com/2 parsed"]
|
44
|
-
end
|
45
34
|
end
|
@@ -11,21 +11,27 @@ describe 'Cached requests' do
|
|
11
11
|
let(:urls) { ['example.com/1', 'example.com/2', 'example.com'] }
|
12
12
|
|
13
13
|
it 'should not send requests to the web if cache exists' do
|
14
|
-
FakeWeb.register_uri(:get, "http://example.com/
|
15
|
-
first_response = FakeWeb.response_for :get, "http://example.com/
|
14
|
+
FakeWeb.register_uri(:get, "http://example.com/cached", :body => "cached Example body1")
|
15
|
+
first_response = FakeWeb.response_for :get, "http://example.com/cached"
|
16
16
|
|
17
|
-
|
18
|
-
|
19
|
-
lambda {
|
20
|
-
WebCrawler::BatchRequest.new("http://example.com/1", cached: true).process
|
21
|
-
}.should raise_error(ArgumentError, /response must be a Net::HTTPResponse/)
|
17
|
+
WebCrawler::BatchRequest.new("http://example.com/cached").process
|
18
|
+
WebCrawler::config.cache.adapter.put(WebCrawler::Response.new(URI.parse("http://example.com/cached"), first_response))
|
22
19
|
|
20
|
+
cached_response = WebCrawler::config.cache.adapter.get("http://example.com/cached")
|
23
21
|
FakeWeb.should_not_receive(:response_for)
|
24
22
|
|
25
|
-
WebCrawler::
|
26
|
-
|
27
|
-
cached_response = WebCrawler::config.cache_adapter.get("http://example.com/1")
|
28
|
-
WebCrawler::BatchRequest.new("http://example.com/1", cached: true).process.first.should be cached_response
|
23
|
+
WebCrawler::BatchRequest.new("http://example.com/cached").process.first.should be cached_response
|
29
24
|
end
|
30
25
|
|
26
|
+
it 'should not be cached' do
  # Register the page and keep the canned HTTP response for seeding the cache.
  FakeWeb.register_uri(:get, "http://example.com/cached", :body => "cached Example body1")
  canned = FakeWeb.response_for(:get, "http://example.com/cached")

  WebCrawler::BatchRequest.new("http://example.com/cached").process

  # Store a response for the same URL in the cache adapter ...
  seeded = WebCrawler::Response.new(URI.parse("http://example.com/cached"), canned)
  WebCrawler.config.cache.adapter.put(seeded)

  # ... and read it back so there is an object to compare against.
  cached_response = WebCrawler.config.cache.adapter.get("http://example.com/cached")

  # With :no_cached the batch must not hand back that very object.
  uncached_batch = WebCrawler::BatchRequest.new("http://example.com/cached", no_cached: true)
  uncached_batch.process.first.should_not be cached_response
end
|
31
37
|
end
|
@@ -3,14 +3,14 @@ require "spec_helper"
|
|
3
3
|
describe WebCrawler::FactoryUrl do
|
4
4
|
|
5
5
|
it "should generate urls with block" do
|
6
|
-
first_param
|
6
|
+
first_param = [1, 2, 3]
|
7
7
|
second_param = 10...15
|
8
8
|
|
9
9
|
factory = WebCrawler::FactoryUrl.new(first_param, second_param) do |*args|
|
10
10
|
random = rand(3000)
|
11
11
|
"www.example.com/%s/%s.html?rid=#{random}" % args
|
12
12
|
end
|
13
|
-
urls
|
13
|
+
urls = factory.factory
|
14
14
|
|
15
15
|
urls.should be_a Array
|
16
16
|
factory.params.size.should == 15
|
@@ -19,16 +19,29 @@ describe WebCrawler::FactoryUrl do
|
|
19
19
|
end
|
20
20
|
|
21
21
|
it "should generate urls with pattern" do
|
22
|
-
first_param
|
22
|
+
first_param = [1, 2, 3]
|
23
23
|
second_param = 10...15
|
24
24
|
|
25
25
|
factory = WebCrawler::FactoryUrl.new("www.example.com/$1/$2.html", first_param, second_param)
|
26
|
-
urls
|
26
|
+
urls = factory.factory
|
27
27
|
|
28
28
|
urls.should be_a Array
|
29
|
-
factory.params.
|
29
|
+
factory.params.should have(15).items
|
30
30
|
urls.should have(factory.params.size).urls
|
31
31
|
urls.first.should == "www.example.com/1/10.html"
|
32
32
|
end
|
33
|
-
|
33
|
+
|
34
|
+
it "should generate urls with pattern and hash options" do
  # Named placeholders (:category, :page) are filled from the options hash.
  url_factory = WebCrawler::FactoryUrl.new(
    "www.example.com/category_:category/page:page/",
    { :page => 1..3, :category => [1, 2, 3, 4] }
  )
  generated = url_factory.factory

  generated.should be_a Array
  # 3 pages x 4 categories
  url_factory.params.should have(12).items
  generated.should have(url_factory.params.size).urls
  generated.first.should == "www.example.com/category_1/page1/"
end
|
46
|
+
|
34
47
|
end
|
@@ -7,16 +7,23 @@ describe WebCrawler::Follower do
|
|
7
7
|
responses = WebCrawler::BatchRequest.new(urls_board_path).process
|
8
8
|
urls = WebCrawler::Follower.new(responses).collect
|
9
9
|
|
10
|
-
urls.
|
11
|
-
urls.
|
10
|
+
urls.should have(9).urls
|
11
|
+
urls.should == known_urls
|
12
12
|
end
|
13
13
|
|
14
14
|
it "should collect all the unique url with same host like in responses" do
|
15
15
|
responses = WebCrawler::BatchRequest.new(urls_board_path).process
|
16
16
|
urls = WebCrawler::Follower.new(responses, same_host: true).collect
|
17
17
|
|
18
|
-
urls.
|
19
|
-
urls.
|
18
|
+
urls.should have(6).urls
|
19
|
+
urls.should == known_urls.reject { |u| u =~ /otherhost/ }
|
20
|
+
end
|
21
|
+
|
22
|
+
it "should collect all the unique url like a given regexp" do
|
23
|
+
responses = WebCrawler::BatchRequest.new(urls_board_path).process
|
24
|
+
urls = WebCrawler::Follower.new(responses, only: /\/\d+\.html/).collect
|
25
|
+
urls.should have(2).urls
|
26
|
+
urls.should == known_urls.select { |u| u =~ /\/\d+\.html/ }
|
20
27
|
end
|
21
28
|
|
22
29
|
it "should process requests for following urls" do
|
@@ -15,8 +15,8 @@ describe WebCrawler::View::Csv do
|
|
15
15
|
end
|
16
16
|
|
17
17
|
it "should render input array to csv string with options" do
|
18
|
-
described_class.new(input, headers: [:title, :url, :author], col_sep: ";").render.should == "title;url;author\n1;2;3\nstring;\"other string\n\"\n"
|
19
|
-
described_class.new(input, headers: [:title, :url, :author], row_sep: "\n\n").render.should == "title,url,author\n\n1,2,3\n\nstring,\"other string\n\"\n\n"
|
18
|
+
described_class.new(input, headers: [:title, :url, :author], csv: {col_sep: ";"}).render.should == "title;url;author\n1;2;3\nstring;\"other string\n\"\n"
|
19
|
+
described_class.new(input, headers: [:title, :url, :author], csv: {row_sep: "\n\n"}).render.should == "title,url,author\n\n1,2,3\n\nstring,\"other string\n\"\n\n"
|
20
20
|
end
|
21
21
|
|
22
22
|
end
|
@@ -27,15 +27,15 @@ describe WebCrawler::View::Json do
|
|
27
27
|
let(:input_hash) { [{ :title=>1, :url=>2, :author=>3 }, { :title=>"string", :url=>"other string\n", :author=>nil }] }
|
28
28
|
|
29
29
|
it "should render input array to json string" do
|
30
|
-
described_class.new(input, headers: [:title, :url, :author]).render.should == '
|
30
|
+
described_class.new(input, headers: [:title, :url, :author]).render.should == '[[1,2,"3"],["string","other string\n"]]'
|
31
31
|
end
|
32
32
|
|
33
33
|
it "should render input hash to json string" do
|
34
34
|
json = described_class.new(input_hash).render
|
35
|
-
json.should ==
|
36
|
-
hash = JSON.parse(json).symbolize_keys
|
37
|
-
hash
|
38
|
-
hash.should ==
|
35
|
+
json.should == '[{"title":1,"url":2,"author":3},{"title":"string","url":"other string\n","author":null}]'
|
36
|
+
hash = JSON.parse(json).map &:symbolize_keys
|
37
|
+
hash.each(&:symbolize_keys!)
|
38
|
+
hash.should == input_hash
|
39
39
|
end
|
40
40
|
end
|
41
41
|
|
@@ -50,7 +50,7 @@ describe WebCrawler::View::Xml do
|
|
50
50
|
"<response><title>1</title><url>2</url><author>3</author></response>" <<
|
51
51
|
"<response><title>string</title><url>other string\n</url><author></author></response>" <<
|
52
52
|
"</responses>"
|
53
|
-
described_class.new(input, headers: [:title, :url, :author]).render.should == xml
|
53
|
+
described_class.new(input, headers: [:title, :url, :author], pretty: false).render.should == xml
|
54
54
|
end
|
55
55
|
|
56
56
|
it "should render input array to pretty xml string" do
|
@@ -58,7 +58,7 @@ describe WebCrawler::View::Xml do
|
|
58
58
|
"<response><title>1</title><url>2</url><author>3</author></response>\n" <<
|
59
59
|
"<response><title>string</title><url>other string\n</url><author></author></response>\n" <<
|
60
60
|
"</responses>"
|
61
|
-
described_class.new(input, headers: [:title, :url, :author]
|
61
|
+
described_class.new(input, headers: [:title, :url, :author]).render.should == xml
|
62
62
|
end
|
63
63
|
|
64
64
|
it "should render input array without :headers to xml string" do
|
@@ -90,6 +90,6 @@ describe WebCrawler::View do
|
|
90
90
|
output = ""
|
91
91
|
io = StringIO.new(output)
|
92
92
|
WebCrawler::View.factory('json', [[1, 2, 3]]).draw(io)
|
93
|
-
output.should == "
|
93
|
+
output.should == "[[1,2,3]]\n"
|
94
94
|
end
|
95
95
|
end
|
@@ -0,0 +1,143 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
3
|
+
# Exercises the WebCrawler::Base class-level DSL through the MyCrawler fixture.
describe WebCrawler::Base do
  require "fixtures/my_crawler"

  # Replace the shared FakeWeb registry with one that answers every MyCrawler
  # target using the XML fixture; the previous registry is put back in
  # after(:all).
  before(:all) do
    @uri_map = FakeWeb::Registry.instance.uri_map
    FakeWeb.clean_registry

    MyCrawler.targets.each do |url|
      # NOTE(review): the body is given as a path -- presumably FakeWeb serves
      # the file's contents when the path exists; confirm.
      FakeWeb.register_uri(:get, url, :body => 'spec/fixtures/example.xml', :content_type => "text/html; charset=windows-1251")
    end
  end

  after(:all) do
    FakeWeb::Registry.instance.uri_map = @uri_map
  end

  describe ' > ', MyCrawler do
    subject { MyCrawler.new }

    it "should be instance of MyCrawler" do
      subject.should be_a MyCrawler
      subject.should be_a_kind_of described_class
    end

    it "should have a target urls" do
      subject.targets.should be_a ::Set
      subject.targets.should have(20).urls
    end

    it "should generate an urls" do
      pattern = "www.example.com/category_:category/page:page/"
      options = { :category => [1, 2, 3, 4], :page => 1..3 }
      described_class.send(:generate_urls, pattern, options).should == ["www.example.com/category_1/page1/",
                                                                        "www.example.com/category_1/page2/",
                                                                        "www.example.com/category_1/page3/",
                                                                        "www.example.com/category_2/page1/",
                                                                        "www.example.com/category_2/page2/",
                                                                        "www.example.com/category_2/page3/",
                                                                        "www.example.com/category_3/page1/",
                                                                        "www.example.com/category_3/page2/",
                                                                        "www.example.com/category_3/page3/",
                                                                        "www.example.com/category_4/page1/",
                                                                        "www.example.com/category_4/page2/",
                                                                        "www.example.com/category_4/page3/"]
    end

    it "logger should be attached to tmp/file.log" do
      subject.logger.should be_a Logger
      subject.logger.instance_variable_get(:@logdev).dev.path.should == '/tmp/file.log'
    end

    it "logger should be attached to Logger.new(STDERR)" do
      # NOTE(review): this reopens MyCrawler and nothing in this spec restores
      # the previous log target, so examples running after this one see the
      # STDERR logger. The "/tmp/file.log" example above only passes when it
      # runs first.
      class MyCrawler
        log_to Logger.new(STDERR)
      end
      subject.logger.should be_a Logger
      subject.logger.instance_variable_get(:@logdev).dev.should == STDERR
    end

    it "cache should be set" do
      WebCrawler.config.cache.adapter.should be_a WebCrawler::CacheAdapter::Base
    end

    it "follow should collect urls from given url and fill targets" do
      FakeWeb.register_uri(:get, urls_board_path, :body => follower_body)
      FakeWeb.register_uri(:get, 'http://example.com/2323.html', :body => '')
      FakeWeb.register_uri(:get, 'http://example.com/2323.html?rr=1', :body => '')
      # Defined inside the example, so the class (a global constant) stays
      # around after the example has finished.
      class TestCrawler < WebCrawler::Base
        target 'http://example.com/follower' do |targets|
          follow targets, :only => /\/\d+\.html/
        end
      end
      TestCrawler.run
      TestCrawler.targets.should == Set["http://example.com/2323.html", "http://example.com/2323.html?rr=1"]
    end

    context 'parsing' do

      context 'context' do

        it 'should initialize mappers' do
          subject.mappers.should be_a Array
          subject.mappers.should have(1).parser
          subject.mappers.first.should be_a WebCrawler::Parsers::Mapper
        end

        context 'mapping' do
          subject { MyCrawler.new.mappers.first.mapping }

          let(:mapping_keys) { ["link",
                                "name",
                                "region",
                                "salary",
                                "description",
                                "contacts",
                                "company",
                                "published",
                                "expire",
                                "catalog item"] }

          it { should be_a Hash }
          it { should_not be_empty }
          it { subject.keys.should == mapping_keys }
        end

        context 'run' do
          # Description fixed: the example asserts a Hash, not an Array.
          it 'parse all elements and return Hash' do
            result = subject.run
            result.should be_a Hash
            result.keys.first.should == :jobs
            result.values.flatten.should have(100).items
          end

          it 'parse all elements and return JSON string' do
            result = subject.run :json
            json = JSON.parse(result)

            result.should be_a String
            result.should =~ /\[{"source_link":/
            json.should be_a Hash
            json.values.flatten.should have(100).items
          end

          # Description fixed: it duplicated the JSON example's description.
          it 'parse all elements and return YAML string' do
            result = subject.run :yaml
            yaml = YAML.load(result)

            result.should be_a String
            result.should =~ /^---/
            yaml.should be_a Hash
            yaml.values.flatten.should have(100).items
          end
        end

      end

    end

  end

end
|