web_crawler 0.3.1 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +2 -0
- data/README +22 -1
- data/lib/web_crawler.rb +2 -0
- data/lib/web_crawler/application.rb +33 -2
- data/lib/web_crawler/base.rb +113 -0
- data/lib/web_crawler/batch_request.rb +10 -4
- data/lib/web_crawler/cached_request.rb +16 -7
- data/lib/web_crawler/configuration.rb +5 -5
- data/lib/web_crawler/factory_url.rb +27 -7
- data/lib/web_crawler/follower.rb +11 -9
- data/lib/web_crawler/parsers.rb +1 -0
- data/lib/web_crawler/parsers/mapper.rb +114 -0
- data/lib/web_crawler/parsers/url.rb +3 -5
- data/lib/web_crawler/request.rb +14 -2
- data/lib/web_crawler/response.rb +2 -2
- data/lib/web_crawler/version.rb +2 -2
- data/lib/web_crawler/view.rb +1 -1
- data/lib/web_crawler/view/csv.rb +1 -1
- data/lib/web_crawler/view/json.rb +1 -1
- data/lib/web_crawler/view/yaml.rb +1 -1
- data/spec/fixtures/example.xml +171 -0
- data/spec/fixtures/my_crawler.rb +82 -0
- data/spec/fixtures/test_crawler.rb +108 -0
- data/spec/fixtures/test_crawler2.rb +77 -0
- data/spec/spec_helper.rb +8 -3
- data/spec/web_crawler/batch_request_spec.rb +0 -11
- data/spec/web_crawler/cached_request_spec.rb +17 -11
- data/spec/web_crawler/factory_url_spec.rb +19 -6
- data/spec/web_crawler/follow_spec.rb +11 -4
- data/spec/web_crawler/view_spec.rb +10 -10
- data/spec/web_crawler/web_crawler_api_base_class_spec.rb +143 -0
- data/web_crawler.gemspec +2 -0
- metadata +43 -8
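
Note: the headline addition in 0.5.0 is the crawler DSL in data/lib/web_crawler/base.rb (+113) and the element-mapping layer in data/lib/web_crawler/parsers/mapper.rb (+114). The sketch below is inferred solely from the fixture crawlers and specs in this diff, not from official documentation; the class name JobsCrawler, the filter clean_up, and all values are illustrative.

class JobsCrawler < WebCrawler::Base
  # Seed URL(s); a block form can `follow` links discovered at the target.
  target "http://www.example.com/jobs.xml"

  log_to   "/tmp/jobs.log"     # path, nil, or a Logger instance
  cache_to "/tmp/jobs_cache"   # path or a cache adapter instance

  # Scope a selector and collect its matches under the :jobs key.
  context "job", :jobs do
    map "name",   :to => :name     # defaults to :on => :inner_text
    map "salary", :to => :profit
    map "description", :to => :description, :filter => :clean_up  # :filter names a class method
  end

  def self.clean_up(text)
    text.strip
  end
end

# NB: the fixture comments below say run returns an Array, while the Base specs
# assert a Hash keyed by context name; treat the exact return shape as unverified.
# JobsCrawler.run(:json) / JobsCrawler.run(:yaml) return serialized strings.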
data/spec/fixtures/test_crawler.rb
ADDED
@@ -0,0 +1,108 @@
+#coding: utf-8
+
+class TestCrawler < WebCrawler::Base
+  target 'http://45.ru/job/vacancy/2.php' do |targets|
+    follow targets, :only => /\/job\/vacancy\/\d+\.html/
+  end
+
+  cache_to '/tmp/wcrawler_cache'
+
+  log_to nil
+
+  context '#block_center > table:first', :vacancies do |table|
+    table.search('td.bg_color2').map do |key|
+      [key.inner_text.strip, key.search('~').inner_text.strip]
+    end
+  end
+
+  protected
+
+  class <<self
+
+    def process(*)
+      normalize_data(super)
+    end
+
+    def normalize_data(data)
+      data[:vacancies].map do |vacancy|
+        Hash[vacancy.map { |key, value| [translate_key(key), normalize_value(value)] }]
+      end
+    end
+
+    def translate_key(key)
+      { "Город" => :city_name,
+        "Фирма" => :company_name,
+        "График работы" => :schedule,
+        "Тип работы" => :employment_type,
+        "Зарплата, руб." => :profit,
+        "Должность" => :name,
+        "Условия" => :conditions,
+        "Требования" => :requirements,
+        "Обязанности" => :responsibilities,
+        "О компании" => :company_description,
+        "Знание языков" => :known_languages,
+        "Знание компьютера" => :known_computer,
+        "Образование" => :education,
+        "Место работы (район)" => :place_of_work,
+        "Бизнес-образование" => :business_education,
+        "Телефон" => :phone,
+        "Контактное лицо" => :contact_name,
+        "E-mail" => :email,
+        "Адрес" => :address,
+        "Стаж" => :experience,
+        "http://" => :site,
+        "url" => :url }[key]
+    end
+
+    def normalize_value(value)
+      value.gsub(/\t+/,' ')
+    end
+
+  end
+
+end
+
+__END__
+
+
+
+Город Курган
+Фирма Розничная сеть Л`Этуаль
+Должность Продавец-консультант ЛЭтуаль
+Зарплата, руб. 20 000
+Форма оплаты Оклад+%
+График работы Полный рабочий день
+Тип работы Постоянная
+Условия Условия работы:
+• Корпоративное обучение
+• График работы сменный;
+• Оформление по Трудовому Кодексу РФ, соц. Пакет
+• Конкурентная заработная плата, оклад + %
+
+В компании разработана уникальная программа карьерного роста для наших сотрудников!
+Требования Требования:
+• Образование средне-специальное, высшее
+• Опыт продаж от 1 года,
+• Возраст от 23 до 35 лет, приятный внешний вид
+• Умение находить контакт с любым покупателем
+• Готовность изменить свой имидж в соответствии с корпоративными требованиями компании
+• Высокая работоспособность, активная жизненная позиция
+• Стрессоустойчивость, хорошая память, желание работать и развиваться
+Обязанности Обязанности:
+• Грамотное консультирование клиента по ассортименту
+• Продажа косметики и парфюмерии
+• Работа с кассой
+• Мерчендайзинг
+• Соблюдение и поддержание стандартов компании
+• Поддержание чистоты рабочего места
+О компании Продавец-консультант - это лицо нашей компании. От качества работы, выполняемой им, зависит общий успех – его и компании. Мы декларируем как основные преимущества нашей компании – уникальный дизайн, богатый ассортимент продукции, гибкую систему скидок и специальные предложения, а самое главное – грамотность, профессионализм и вежливость продавцов-консультантов наших магазинов!
+
+ОБЯЗАТЕЛЬНО УКАЗЫВАЙТЕ в теме письма "Продавец-консультант г Курган"
+Образование Среднее специальное
+Стаж 1
+Степень ограничения трудоспособности Отсутствует
+Телефон 8-982-602-8331
+Контактное лицо Татьяна Александровна
+E-mail hrm1-svx-ur@letuin.ru
+http:// www.letoile.ru%2F
+Документы для скачивания
data/spec/fixtures/test_crawler2.rb
ADDED
@@ -0,0 +1,77 @@
+#encoding: utf-8
+
+class TestCrawler2 < WebCrawler::Base
+
+  target "http://www.superjob.ru/export/vacs_to_xml.php"
+
+  log_to "/tmp/file.log" # or Logger.new(...)
+
+  cache_to '/tmp/wcrawler/cache' # or (CacheClass < CacheAdapter).new *args
+
+  context "job", :jobs do
+
+    map 'link', :to => :source_link, :on => :inner_text # default :on => :inner_text
+    map 'name', :to => :name
+    map 'region', :to => :city_name
+    map 'salary', :to => :profit
+    map 'description', :to => :description, :filter => :format_description
+    map 'contacts', :to => :contact_text
+    map 'company', :to => :company, :on => [:attr, :id]
+    map 'published', :to => :published_at
+    map 'expire', :to => :expire_at
+    map 'catalog item', :to => :specialization_ids, :on => nil, :filter => :convert_specs
+
+  end
+
+  protected
+
+  def self.format_description(text)
+    @titles ||= ["Условия работы и компенсации:\n",
+                 "Место работы:\n",
+                 "Должностные обязанности:\n",
+                 "Требования к квалификации:\n"]
+
+    text.each_line.inject("") { |new_text, line| new_text << (@titles.include?(line) ? "<h4>#{line.chomp}</h4>\n" : line) }
+  end
+
+  def self.convert_specs(specs)
+    @ids_mapping ||= {
+      911 => 4537,
+      1 => 4274,
+      5 => 4335,
+      6 => 4408,
+      16 => [4756, 4545],
+      3 => 4488,
+      9 => 4303,
+      8 => 4649,
+      547 => 4237,
+      579 => 4237,
+      1104 => 4671,
+      10 => 4588,
+      814 => 4568,
+      2 => 4714,
+      11 => 4671,
+      13 => 4691,
+      15 => 4649,
+      17 => 4504,
+      601 => 4428,
+      45 => 4632,
+      22 => 4473,
+      515 => 4524,
+      19 => 4473,
+      20 => 4524,
+      398 => 4749,
+      503 => 4775,
+      941 => 4742,
+      1434 => 4802,
+      2109 => 4537
+    }
+    specs.map { |i| @ids_mapping[i['thread'].to_i] }.to_a.flatten
+  end
+
+end
+
+
+#MyCrawler.run # => return Array
+#MyCrawler.run(:json) # => return String like a JSON object
+#MyCrawler.run(:yaml) # => return String of YAML format
data/spec/spec_helper.rb
CHANGED
@@ -9,9 +9,14 @@ require 'fake_web_generator'
 RSpec.configure do |c|
   c.mock_with :rspec
   c.include FakeWebGenerator
-end

-
-
+  c.before(:each) do
+    WebCrawler.configure do
+      config.logger = nil
+      config.cache_adapter = WebCrawler::CacheAdapter::Memory.new
+      config.logger.level = Logger::ERROR
+    end
+  end
 end

+
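
Note: the before(:each) hook above exercises the new WebCrawler.configure block, inside which a config object is exposed. A minimal usage sketch, assuming only the accessors this hunk itself touches; the hunk assigns config.logger = nil and then sets config.logger.level, which implies the logger reader falls back to a default logger when unset.

WebCrawler.configure do
  config.logger        = Logger.new('/tmp/wcrawler.log')       # or nil for the default logger
  config.cache_adapter = WebCrawler::CacheAdapter::Memory.new  # in-memory cache, as in these specs
end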
data/spec/web_crawler/batch_request_spec.rb
CHANGED
@@ -27,19 +27,8 @@ describe WebCrawler::BatchRequest do
   end

   it "should process requests" do
-    subject.requests.map { |r| r.should_receive(:process).with(no_args).and_return(responses.first) }
     subject.process.should be_a Array
     subject.process.first.should be_a WebCrawler::Response
   end

-  it "should accept :parser option with parser class or object" do
-    class ::TestParser
-      def parse(resp)
-        resp.to_s + ' parsed'
-      end
-    end
-    described_class.new(urls, parser: TestParser.new).process.should == ["Example body parsed",
-                                                                         "Example body1 parsed",
-                                                                         "Example body for url http://example.com/2 parsed"]
-  end
 end
data/spec/web_crawler/cached_request_spec.rb
CHANGED
@@ -11,21 +11,27 @@ describe 'Cached requests' do
   let(:urls) { ['example.com/1', 'example.com/2', 'example.com'] }

   it 'should not send requests to the web if cache exists' do
-    FakeWeb.register_uri(:get, "http://example.com/
-    first_response = FakeWeb.response_for :get, "http://example.com/
+    FakeWeb.register_uri(:get, "http://example.com/cached", :body => "cached Example body1")
+    first_response = FakeWeb.response_for :get, "http://example.com/cached"

-
-
-    lambda {
-      WebCrawler::BatchRequest.new("http://example.com/1", cached: true).process
-    }.should raise_error(ArgumentError, /response must be a Net::HTTPResponse/)
+    WebCrawler::BatchRequest.new("http://example.com/cached").process
+    WebCrawler::config.cache.adapter.put(WebCrawler::Response.new(URI.parse("http://example.com/cached"), first_response))

+    cached_response = WebCrawler::config.cache.adapter.get("http://example.com/cached")
     FakeWeb.should_not_receive(:response_for)

-    WebCrawler::
-
-    cached_response = WebCrawler::config.cache_adapter.get("http://example.com/1")
-    WebCrawler::BatchRequest.new("http://example.com/1", cached: true).process.first.should be cached_response
+    WebCrawler::BatchRequest.new("http://example.com/cached").process.first.should be cached_response
   end

+  it 'should not be cached' do
+    FakeWeb.register_uri(:get, "http://example.com/cached", :body => "cached Example body1")
+    first_response = FakeWeb.response_for :get, "http://example.com/cached"
+
+    WebCrawler::BatchRequest.new("http://example.com/cached").process
+    WebCrawler::config.cache.adapter.put(WebCrawler::Response.new(URI.parse("http://example.com/cached"), first_response))
+
+    cached_response = WebCrawler::config.cache.adapter.get("http://example.com/cached")
+
+    WebCrawler::BatchRequest.new("http://example.com/cached", no_cached: true).process.first.should_not be cached_response
+  end
 end
data/spec/web_crawler/factory_url_spec.rb
CHANGED
@@ -3,14 +3,14 @@ require "spec_helper"
 describe WebCrawler::FactoryUrl do

   it "should generate urls with block" do
-    first_param
+    first_param = [1, 2, 3]
     second_param = 10...15

     factory = WebCrawler::FactoryUrl.new(first_param, second_param) do |*args|
       random = rand(3000)
       "www.example.com/%s/%s.html?rid=#{random}" % args
     end
-    urls
+    urls = factory.factory

     urls.should be_a Array
     factory.params.size.should == 15
@@ -19,16 +19,29 @@ describe WebCrawler::FactoryUrl do
   end

   it "should generate urls with pattern" do
-    first_param
+    first_param = [1, 2, 3]
     second_param = 10...15

     factory = WebCrawler::FactoryUrl.new("www.example.com/$1/$2.html", first_param, second_param)
-    urls
+    urls = factory.factory

     urls.should be_a Array
-    factory.params.
+    factory.params.should have(15).items
     urls.should have(factory.params.size).urls
     urls.first.should == "www.example.com/1/10.html"
   end
-
+
+  it "should generate urls with pattern and hash options" do
+    pattern = "www.example.com/category_:category/page:page/"
+    options = { :page => 1..3, :category => [1, 2, 3, 4] }
+
+    factory = WebCrawler::FactoryUrl.new(pattern, options)
+    urls = factory.factory
+
+    urls.should be_a Array
+    factory.params.should have(12).items
+    urls.should have(factory.params.size).urls
+    urls.first.should == "www.example.com/category_1/page1/"
+  end
+
 end
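
Note: the new third example above covers a FactoryUrl form that takes a pattern with named :placeholders plus an options hash. A sketch asserting only what the spec itself asserts (the 12 urls are the cartesian product of 4 categories and 3 pages):

factory = WebCrawler::FactoryUrl.new("www.example.com/category_:category/page:page/",
                                     :page => 1..3, :category => [1, 2, 3, 4])
urls = factory.factory
urls.size   # => 12
urls.first  # => "www.example.com/category_1/page1/"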
data/spec/web_crawler/follow_spec.rb
CHANGED
@@ -7,16 +7,23 @@ describe WebCrawler::Follower do
     responses = WebCrawler::BatchRequest.new(urls_board_path).process
     urls = WebCrawler::Follower.new(responses).collect

-    urls.
-    urls.
+    urls.should have(9).urls
+    urls.should == known_urls
   end

   it "should collect all the unique url with same host like in responses" do
     responses = WebCrawler::BatchRequest.new(urls_board_path).process
     urls = WebCrawler::Follower.new(responses, same_host: true).collect

-    urls.
-    urls.
+    urls.should have(6).urls
+    urls.should == known_urls.reject { |u| u =~ /otherhost/ }
+  end
+
+  it "should collect all the unique url like a given regexp" do
+    responses = WebCrawler::BatchRequest.new(urls_board_path).process
+    urls = WebCrawler::Follower.new(responses, only: /\/\d+\.html/).collect
+    urls.should have(2).urls
+    urls.should == known_urls.select { |u| u =~ /\/\d+\.html/ }
   end

   it "should process requests for following urls" do
data/spec/web_crawler/view_spec.rb
CHANGED
@@ -15,8 +15,8 @@ describe WebCrawler::View::Csv do
   end

   it "should render input array to csv string with options" do
-    described_class.new(input, headers: [:title, :url, :author], col_sep: ";").render.should == "title;url;author\n1;2;3\nstring;\"other string\n\"\n"
-    described_class.new(input, headers: [:title, :url, :author], row_sep: "\n\n").render.should == "title,url,author\n\n1,2,3\n\nstring,\"other string\n\"\n\n"
+    described_class.new(input, headers: [:title, :url, :author], csv: {col_sep: ";"}).render.should == "title;url;author\n1;2;3\nstring;\"other string\n\"\n"
+    described_class.new(input, headers: [:title, :url, :author], csv: {row_sep: "\n\n"}).render.should == "title,url,author\n\n1,2,3\n\nstring,\"other string\n\"\n\n"
   end

 end
@@ -27,15 +27,15 @@ describe WebCrawler::View::Json do
   let(:input_hash) { [{ :title=>1, :url=>2, :author=>3 }, { :title=>"string", :url=>"other string\n", :author=>nil }] }

   it "should render input array to json string" do
-    described_class.new(input, headers: [:title, :url, :author]).render.should == '
+    described_class.new(input, headers: [:title, :url, :author]).render.should == '[[1,2,"3"],["string","other string\n"]]'
   end

   it "should render input hash to json string" do
     json = described_class.new(input_hash).render
-    json.should ==
-    hash = JSON.parse(json).symbolize_keys
-    hash
-    hash.should ==
+    json.should == '[{"title":1,"url":2,"author":3},{"title":"string","url":"other string\n","author":null}]'
+    hash = JSON.parse(json).map &:symbolize_keys
+    hash.each(&:symbolize_keys!)
+    hash.should == input_hash
   end
 end

@@ -50,7 +50,7 @@ describe WebCrawler::View::Xml do
          "<response><title>1</title><url>2</url><author>3</author></response>" <<
          "<response><title>string</title><url>other string\n</url><author></author></response>" <<
          "</responses>"
-    described_class.new(input, headers: [:title, :url, :author]).render.should == xml
+    described_class.new(input, headers: [:title, :url, :author], pretty: false).render.should == xml
   end

   it "should render input array to pretty xml string" do
@@ -58,7 +58,7 @@ describe WebCrawler::View::Xml do
          "<response><title>1</title><url>2</url><author>3</author></response>\n" <<
          "<response><title>string</title><url>other string\n</url><author></author></response>\n" <<
          "</responses>"
-    described_class.new(input, headers: [:title, :url, :author]
+    described_class.new(input, headers: [:title, :url, :author]).render.should == xml
   end

   it "should render input array without :headers to xml string" do
@@ -90,6 +90,6 @@ describe WebCrawler::View do
     output = ""
     io = StringIO.new(output)
     WebCrawler::View.factory('json', [[1, 2, 3]]).draw(io)
-    output.should == "
+    output.should == "[[1,2,3]]\n"
   end
 end
data/spec/web_crawler/web_crawler_api_base_class_spec.rb
ADDED
@@ -0,0 +1,143 @@
+require "spec_helper"
+
+describe WebCrawler::Base do
+  require "fixtures/my_crawler"
+
+  before(:all) do
+    @uri_map = FakeWeb::Registry.instance.uri_map
+    FakeWeb.clean_registry
+
+    MyCrawler.targets.each do |url|
+      FakeWeb.register_uri(:get, url, :body => 'spec/fixtures/example.xml', :content_type => "text/html; charset=windows-1251")
+    end
+  end
+
+  after(:all) do
+    FakeWeb::Registry.instance.uri_map = @uri_map
+  end
+
+  describe ' > ', MyCrawler do
+    subject { MyCrawler.new }
+
+    it "should be instance of MyCrawler" do
+      subject.should be_a MyCrawler
+      subject.should be_a_kind_of described_class
+    end
+
+    it "should have a target urls" do
+      subject.targets.should be_a ::Set
+      subject.targets.should have(20).urls
+    end
+
+    it "should generate an urls" do
+      pattern = "www.example.com/category_:category/page:page/"
+      options = { :category => [1, 2, 3, 4], :page => 1..3 }
+      described_class.send(:generate_urls, pattern, options).should == ["www.example.com/category_1/page1/",
+                                                                        "www.example.com/category_1/page2/",
+                                                                        "www.example.com/category_1/page3/",
+                                                                        "www.example.com/category_2/page1/",
+                                                                        "www.example.com/category_2/page2/",
+                                                                        "www.example.com/category_2/page3/",
+                                                                        "www.example.com/category_3/page1/",
+                                                                        "www.example.com/category_3/page2/",
+                                                                        "www.example.com/category_3/page3/",
+                                                                        "www.example.com/category_4/page1/",
+                                                                        "www.example.com/category_4/page2/",
+                                                                        "www.example.com/category_4/page3/"]
+    end
+
+    it "logger should be attached to tmp/file.log" do
+      subject.logger.should be_a Logger
+      subject.logger.instance_variable_get(:@logdev).dev.path.should == '/tmp/file.log'
+    end
+
+    it "logger should be attached to Logger.new(STDERR)" do
+      class MyCrawler
+        log_to Logger.new(STDERR)
+      end
+      subject.logger.should be_a Logger
+      subject.logger.instance_variable_get(:@logdev).dev.should == STDERR
+    end
+
+    it "cache should be set" do
+      WebCrawler.config.cache.adapter.should be_a WebCrawler::CacheAdapter::Base
+    end
+
+    it "follow should collect urls from given url and fill targets" do
+      FakeWeb.register_uri(:get, urls_board_path, :body => follower_body)
+      FakeWeb.register_uri(:get, 'http://example.com/2323.html', :body => '')
+      FakeWeb.register_uri(:get, 'http://example.com/2323.html?rr=1', :body => '')
+      class TestCrawler < WebCrawler::Base
+        target 'http://example.com/follower' do |targets|
+          follow targets, :only => /\/\d+\.html/
+        end
+      end
+      TestCrawler.run
+      TestCrawler.targets.should == Set["http://example.com/2323.html", "http://example.com/2323.html?rr=1"]
+    end
+
+    context 'parsing' do
+
+      context 'context' do
+
+        it 'should initialize mappers' do
+          subject.mappers.should be_a Array
+          subject.mappers.should have(1).parser
+          subject.mappers.first.should be_a WebCrawler::Parsers::Mapper
+        end
+
+        context 'mapping' do
+          subject { MyCrawler.new.mappers.first.mapping }
+
+          let(:mapping_keys) { ["link",
+                                "name",
+                                "region",
+                                "salary",
+                                "description",
+                                "contacts",
+                                "company",
+                                "published",
+                                "expire",
+                                "catalog item"] }
+
+          it { should be_a Hash }
+          it { should_not be_empty }
+          it { subject.keys.should == mapping_keys }
+        end
+
+        context 'run' do
+          it 'parse all elements and return Array' do
+            result = subject.run
+            result.should be_a Hash
+            result.keys.first.should == :jobs
+            result.values.flatten.should have(100).items
+          end
+
+          it 'parse all elements and return JSON string' do
+            result = subject.run :json
+            json = JSON.parse(result)
+
+            result.should be_a String
+            result.should =~ /\[{"source_link":/
+            json.should be_a Hash
+            json.values.flatten.should have(100).items
+          end
+
+          it 'parse all elements and return JSON string' do
+            result = subject.run :yaml
+            yaml = YAML.load(result)
+
+            result.should be_a String
+            result.should =~ /^---/
+            yaml.should be_a Hash
+            yaml.values.flatten.should have(100).items
+          end
+        end
+
+      end
+
+    end
+
+  end
+
+end