web_crawler 0.3.1 → 0.5.0

@@ -0,0 +1,108 @@
+ #coding: utf-8
+
+ class TestCrawler < WebCrawler::Base
+   target 'http://45.ru/job/vacancy/2.php' do |targets|
+     follow targets, :only => /\/job\/vacancy\/\d+\.html/
+   end
+
+   cache_to '/tmp/wcrawler_cache'
+
+   log_to nil
+
+   context '#block_center > table:first', :vacancies do |table|
+     table.search('td.bg_color2').map do |key|
+       [key.inner_text.strip, key.search('~').inner_text.strip]
+     end
+   end
+
+   protected
+
+   class << self
+
+     def process(*)
+       normalize_data(super)
+     end
+
+     def normalize_data(data)
+       data[:vacancies].map do |vacancy|
+         Hash[vacancy.map { |key, value| [translate_key(key), normalize_value(value)] }]
+       end
+     end
+
+     def translate_key(key)
+       { "Город" => :city_name,
+         "Фирма" => :company_name,
+         "График работы" => :schedule,
+         "Тип работы" => :employment_type,
+         "Зарплата, руб." => :profit,
+         "Должность" => :name,
+         "Условия" => :conditions,
+         "Требования" => :requirements,
+         "Обязанности" => :responsibilities,
+         "О компании" => :company_description,
+         "Знание языков" => :known_languages,
+         "Знание компьютера" => :known_computer,
+         "Образование" => :education,
+         "Место работы (район)" => :place_of_work,
+         "Бизнес-образование" => :business_education,
+         "Телефон" => :phone,
+         "Контактное лицо" => :contact_name,
+         "E-mail" => :email,
+         "Адрес" => :address,
+         "Стаж" => :experience,
+         "http://" => :site,
+         "url" => :url }[key]
+     end
+
+     def normalize_value(value)
+       value.gsub(/\t+/, ' ')
+     end
+
+   end
+
+ end
+
+ __END__
+
+
+
+ Город Курган
+ Фирма Розничная сеть Л`Этуаль
+ Должность Продавец-консультант ЛЭтуаль
+ Зарплата, руб. 20 000
+ Форма оплаты Оклад+%
+ График работы Полный рабочий день
+ Тип работы Постоянная
+ Условия Условия работы:
+ • Корпоративное обучение
+ • График работы сменный;
+ • Оформление по Трудовому Кодексу РФ, соц. Пакет
+ • Конкурентная заработная плата, оклад + %
+
+ В компании разработана уникальная программа карьерного роста для наших сотрудников!
+ Требования Требования:
+ • Образование средне-специальное, высшее
+ • Опыт продаж от 1 года,
+ • Возраст от 23 до 35 лет, приятный внешний вид
+ • Умение находить контакт с любым покупателем
+ • Готовность изменить свой имидж в соответствии с корпоративными требованиями компании
+ • Высокая работоспособность, активная жизненная позиция
+ • Стрессоустойчивость, хорошая память, желание работать и развиваться
+ Обязанности Обязанности:
+ • Грамотное консультирование клиента по ассортименту
+ • Продажа косметики и парфюмерии
+ • Работа с кассой
+ • Мерчендайзинг
+ • Соблюдение и поддержание стандартов компании
+ • Поддержание чистоты рабочего места
+ О компании Продавец-консультант - это лицо нашей компании. От качества работы, выполняемой им, зависит общий успех – его и компании. Мы декларируем как основные преимущества нашей компании – уникальный дизайн, богатый ассортимент продукции, гибкую систему скидок и специальные предложения, а самое главное – грамотность, профессионализм и вежливость продавцов-консультантов наших магазинов!
+
+ ОБЯЗАТЕЛЬНО УКАЗЫВАЙТЕ в теме письма "Продавец-консультант г Курган"
+ Образование Среднее специальное
+ Стаж 1
+ Степень ограничения трудоспособности Отсутствует
+ Телефон 8-982-602-8331
+ Контактное лицо Татьяна Александровна
+ E-mail hrm1-svx-ur@letuin.ru
+ http:// www.letoile.ru%2F
+ Документы для скачивания
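The fixture above exercises the whole crawling DSL: target plus follow seed and expand the URL frontier, cache_to and log_to configure caching and logging, a context block extracts label/value pairs from each page, and a class << self override of process post-processes the parsed result. A minimal sketch of the same shape (class name, selectors, and URLs are illustrative, not from the gem):

    #coding: utf-8
    require 'web_crawler'

    class ListingCrawler < WebCrawler::Base
      # Seed with an index page; queue only links that look like detail pages.
      target 'http://example.com/listings/index.html' do |targets|
        follow targets, :only => /\/listings\/\d+\.html/
      end

      cache_to '/tmp/listing_cache' # responses are cached between runs
      log_to nil                    # silence logging

      # The block receives the node matched by the CSS selector and returns
      # whatever structure should appear under the :listings key of the result.
      context 'table.listings', :listings do |table|
        table.search('td.label').map do |cell|
          [cell.inner_text.strip, cell.search('~').inner_text.strip]
        end
      end
    end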
@@ -0,0 +1,77 @@
+ #encoding: utf-8
+
+ class TestCrawler2 < WebCrawler::Base
+
+   target "http://www.superjob.ru/export/vacs_to_xml.php"
+
+   log_to "/tmp/file.log" # or Logger.new(...)
+
+   cache_to '/tmp/wcrawler/cache' # or (CacheClass < CacheAdapter).new *args
+
+   context "job", :jobs do
+
+     map 'link', :to => :source_link, :on => :inner_text # default :on => :inner_text
+     map 'name', :to => :name
+     map 'region', :to => :city_name
+     map 'salary', :to => :profit
+     map 'description', :to => :description, :filter => :format_description
+     map 'contacts', :to => :contact_text
+     map 'company', :to => :company, :on => [:attr, :id]
+     map 'published', :to => :published_at
+     map 'expire', :to => :expire_at
+     map 'catalog item', :to => :specialization_ids, :on => nil, :filter => :convert_specs
+
+   end
+
+   protected
+
+   def self.format_description(text)
+     @titles ||= ["Условия работы и компенсации:\n",
+                  "Место работы:\n",
+                  "Должностные обязанности:\n",
+                  "Требования к квалификации:\n"]
+
+     text.each_line.inject("") { |new_text, line| new_text << (@titles.include?(line) ? "<h4>#{line.chomp}</h4>\n" : line) }
+   end
+
+   def self.convert_specs(specs)
+     @ids_mapping ||= {
+       911 => 4537,
+       1 => 4274,
+       5 => 4335,
+       6 => 4408,
+       16 => [4756, 4545],
+       3 => 4488,
+       9 => 4303,
+       8 => 4649,
+       547 => 4237,
+       579 => 4237,
+       1104 => 4671,
+       10 => 4588,
+       814 => 4568,
+       2 => 4714,
+       11 => 4671,
+       13 => 4691,
+       15 => 4649,
+       17 => 4504,
+       601 => 4428,
+       45 => 4632,
+       22 => 4473,
+       515 => 4524,
+       19 => 4473,
+       20 => 4524,
+       398 => 4749,
+       503 => 4775,
+       941 => 4742,
+       1434 => 4802,
+       2109 => 4537
+     }
+     specs.map { |i| @ids_mapping[i['thread'].to_i] }.to_a.flatten
+   end
+
+ end
+
+
+ # MyCrawler.run        # => returns an Array
+ # MyCrawler.run(:json) # => returns a String containing JSON
+ # MyCrawler.run(:yaml) # => returns a String in YAML format
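Combined with the comments above, a run of this fixture would look roughly like this (a sketch; the exact shape of the parsed data is whatever the crawler's contexts produce):

    data = TestCrawler2.run          # parsed result as a Ruby structure
    json = TestCrawler2.run(:json)   # the same data serialized to a JSON string
    yaml = TestCrawler2.run(:yaml)   # the same data serialized to a YAML string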
data/spec/spec_helper.rb CHANGED
@@ -9,9 +9,14 @@ require 'fake_web_generator'
 RSpec.configure do |c|
   c.mock_with :rspec
   c.include FakeWebGenerator
-end
 
-WebCrawler.configure do
-  config.cache_adapter = WebCrawler::CacheAdapter::Memory.new
+  c.before(:each) do
+    WebCrawler.configure do
+      config.logger = nil
+      config.cache_adapter = WebCrawler::CacheAdapter::Memory.new
+      config.logger.level = Logger::ERROR
+    end
+  end
 end
 
+
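The helper now resets the crawler configuration before every example instead of configuring it once for the whole suite. The same configure block works outside the specs; a minimal sketch (assuming config falls back to a default logger when none is assigned):

    require 'web_crawler'

    WebCrawler.configure do
      config.cache_adapter = WebCrawler::CacheAdapter::Memory.new # in-memory cache, as in the specs
      config.logger.level  = Logger::ERROR                        # log errors only
    end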
@@ -27,19 +27,8 @@ describe WebCrawler::BatchRequest do
   end
 
   it "should process requests" do
-    subject.requests.map { |r| r.should_receive(:process).with(no_args).and_return(responses.first) }
     subject.process.should be_a Array
     subject.process.first.should be_a WebCrawler::Response
   end
 
-  it "should accept :parser option with parser class or object" do
-    class ::TestParser
-      def parse(resp)
-        resp.to_s + ' parsed'
-      end
-    end
-    described_class.new(urls, parser: TestParser.new).process.should == ["Example body parsed",
-                                                                         "Example body1 parsed",
-                                                                         "Example body for url http://example.com/2 parsed"]
-  end
 end
@@ -11,21 +11,27 @@ describe 'Cached requests' do
   let(:urls) { ['example.com/1', 'example.com/2', 'example.com'] }
 
   it 'should not send requests to the web if cache exists' do
-    FakeWeb.register_uri(:get, "http://example.com/1", :body => "Example body1")
-    first_response = FakeWeb.response_for :get, "http://example.com/1"
+    FakeWeb.register_uri(:get, "http://example.com/cached", :body => "cached Example body1")
+    first_response = FakeWeb.response_for :get, "http://example.com/cached"
 
-    FakeWeb.should_receive(:response_for).with(:get, "http://example.com/1").and_return { first_response }
-
-    lambda {
-      WebCrawler::BatchRequest.new("http://example.com/1", cached: true).process
-    }.should raise_error(ArgumentError, /response must be a Net::HTTPResponse/)
+    WebCrawler::BatchRequest.new("http://example.com/cached").process
+    WebCrawler::config.cache.adapter.put(WebCrawler::Response.new(URI.parse("http://example.com/cached"), first_response))
 
+    cached_response = WebCrawler::config.cache.adapter.get("http://example.com/cached")
     FakeWeb.should_not_receive(:response_for)
 
-    WebCrawler::config.cache_adapter.put(WebCrawler::Response.new(URI.parse("http://example.com/1"), first_response))
-
-    cached_response = WebCrawler::config.cache_adapter.get("http://example.com/1")
-    WebCrawler::BatchRequest.new("http://example.com/1", cached: true).process.first.should be cached_response
+    WebCrawler::BatchRequest.new("http://example.com/cached").process.first.should be cached_response
   end
 
+  it 'should not be cached' do
+    FakeWeb.register_uri(:get, "http://example.com/cached", :body => "cached Example body1")
+    first_response = FakeWeb.response_for :get, "http://example.com/cached"
+
+    WebCrawler::BatchRequest.new("http://example.com/cached").process
+    WebCrawler::config.cache.adapter.put(WebCrawler::Response.new(URI.parse("http://example.com/cached"), first_response))
+
+    cached_response = WebCrawler::config.cache.adapter.get("http://example.com/cached")
+
+    WebCrawler::BatchRequest.new("http://example.com/cached", no_cached: true).process.first.should_not be cached_response
+  end
 end
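These two examples pin down the new cache semantics: once a response has been put into the configured cache adapter, later requests for the same URL are served from the cache, and no_cached: true opts a request out. In sketch form (URL as in the specs above):

    WebCrawler::BatchRequest.new("http://example.com/cached").process
    # => served from the cache adapter when an entry exists

    WebCrawler::BatchRequest.new("http://example.com/cached", no_cached: true).process
    # => bypasses the cache and fetches afresh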
@@ -3,14 +3,14 @@ require "spec_helper"
 describe WebCrawler::FactoryUrl do
 
   it "should generate urls with block" do
-    first_param = [1,2,3]
+    first_param = [1, 2, 3]
     second_param = 10...15
 
     factory = WebCrawler::FactoryUrl.new(first_param, second_param) do |*args|
       random = rand(3000)
       "www.example.com/%s/%s.html?rid=#{random}" % args
     end
-    urls = factory.factory
+    urls = factory.factory
 
     urls.should be_a Array
     factory.params.size.should == 15
@@ -19,16 +19,29 @@ describe WebCrawler::FactoryUrl do
   end
 
   it "should generate urls with pattern" do
-    first_param = [1,2,3]
+    first_param = [1, 2, 3]
     second_param = 10...15
 
     factory = WebCrawler::FactoryUrl.new("www.example.com/$1/$2.html", first_param, second_param)
-    urls = factory.factory
+    urls = factory.factory
 
     urls.should be_a Array
-    factory.params.size.should == 15
+    factory.params.should have(15).items
     urls.should have(factory.params.size).urls
     urls.first.should == "www.example.com/1/10.html"
   end
-
+
+  it "should generate urls with pattern and hash options" do
+    pattern = "www.example.com/category_:category/page:page/"
+    options = { :page => 1..3, :category => [1, 2, 3, 4] }
+
+    factory = WebCrawler::FactoryUrl.new(pattern, options)
+    urls = factory.factory
+
+    urls.should be_a Array
+    factory.params.should have(12).items
+    urls.should have(factory.params.size).urls
+    urls.first.should == "www.example.com/category_1/page1/"
+  end
+
 end
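The specs above cover the three ways FactoryUrl expands its parameters into URLs (one URL per combination of the given enumerables). A sketch assembled from the assertions:

    # Block form: every combination of the params is yielded to the block.
    WebCrawler::FactoryUrl.new([1, 2, 3], 10...15) { |a, b| "www.example.com/#{a}/#{b}.html" }.factory

    # Positional pattern: $1, $2, ... are substituted in order.
    WebCrawler::FactoryUrl.new("www.example.com/$1/$2.html", [1, 2, 3], 10...15).factory
    # => ["www.example.com/1/10.html", ...] (3 * 5 = 15 urls)

    # Named pattern: :keys are filled from a hash of enumerables.
    WebCrawler::FactoryUrl.new("www.example.com/category_:category/page:page/",
                               :page => 1..3, :category => [1, 2, 3, 4]).factory
    # => ["www.example.com/category_1/page1/", ...] (4 * 3 = 12 urls)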
@@ -7,16 +7,23 @@ describe WebCrawler::Follower do
     responses = WebCrawler::BatchRequest.new(urls_board_path).process
     urls = WebCrawler::Follower.new(responses).collect
 
-    urls.first.should have(9).urls
-    urls.first.should == known_urls
+    urls.should have(9).urls
+    urls.should == known_urls
   end
 
   it "should collect all the unique url with same host like in responses" do
     responses = WebCrawler::BatchRequest.new(urls_board_path).process
     urls = WebCrawler::Follower.new(responses, same_host: true).collect
 
-    urls.first.should have(6).urls
-    urls.first.should == known_urls.reject { |u| u =~ /otherhost/ }
+    urls.should have(6).urls
+    urls.should == known_urls.reject { |u| u =~ /otherhost/ }
+  end
+
+  it "should collect all the unique url like a given regexp" do
+    responses = WebCrawler::BatchRequest.new(urls_board_path).process
+    urls = WebCrawler::Follower.new(responses, only: /\/\d+\.html/).collect
+    urls.should have(2).urls
+    urls.should == known_urls.select { |u| u =~ /\/\d+\.html/ }
   end
 
   it "should process requests for following urls" do
@@ -15,8 +15,8 @@ describe WebCrawler::View::Csv do
   end
 
   it "should render input array to csv string with options" do
-    described_class.new(input, headers: [:title, :url, :author], col_sep: ";").render.should == "title;url;author\n1;2;3\nstring;\"other string\n\"\n"
-    described_class.new(input, headers: [:title, :url, :author], row_sep: "\n\n").render.should == "title,url,author\n\n1,2,3\n\nstring,\"other string\n\"\n\n"
+    described_class.new(input, headers: [:title, :url, :author], csv: {col_sep: ";"}).render.should == "title;url;author\n1;2;3\nstring;\"other string\n\"\n"
+    described_class.new(input, headers: [:title, :url, :author], csv: {row_sep: "\n\n"}).render.should == "title,url,author\n\n1,2,3\n\nstring,\"other string\n\"\n\n"
   end
 
 end
@@ -27,15 +27,15 @@ describe WebCrawler::View::Json do
   let(:input_hash) { [{ :title=>1, :url=>2, :author=>3 }, { :title=>"string", :url=>"other string\n", :author=>nil }] }
 
   it "should render input array to json string" do
-    described_class.new(input, headers: [:title, :url, :author]).render.should == '{"responses":[[1,2,"3"],["string","other string\n"]]}'
+    described_class.new(input, headers: [:title, :url, :author]).render.should == '[[1,2,"3"],["string","other string\n"]]'
   end
 
   it "should render input hash to json string" do
     json = described_class.new(input_hash).render
-    json.should == "{\"responses\":[{\"title\":1,\"url\":2,\"author\":3},{\"title\":\"string\",\"url\":\"other string\\n\",\"author\":null}]}"
-    hash = JSON.parse(json).symbolize_keys
-    hash[:responses].each(&:symbolize_keys!)
-    hash.should == { responses: input_hash }
+    json.should == '[{"title":1,"url":2,"author":3},{"title":"string","url":"other string\n","author":null}]'
+    hash = JSON.parse(json).map &:symbolize_keys
+    hash.each(&:symbolize_keys!)
+    hash.should == input_hash
   end
 end
 
@@ -50,7 +50,7 @@ describe WebCrawler::View::Xml do
     "<response><title>1</title><url>2</url><author>3</author></response>" <<
     "<response><title>string</title><url>other string\n</url><author></author></response>" <<
     "</responses>"
-    described_class.new(input, headers: [:title, :url, :author]).render.should == xml
+    described_class.new(input, headers: [:title, :url, :author], pretty: false).render.should == xml
   end
 
   it "should render input array to pretty xml string" do
@@ -58,7 +58,7 @@ describe WebCrawler::View::Xml do
     "<response><title>1</title><url>2</url><author>3</author></response>\n" <<
     "<response><title>string</title><url>other string\n</url><author></author></response>\n" <<
     "</responses>"
-    described_class.new(input, headers: [:title, :url, :author], pretty: true).render.should == xml
+    described_class.new(input, headers: [:title, :url, :author]).render.should == xml
   end
 
   it "should render input array without :headers to xml string" do
@@ -90,6 +90,6 @@ describe WebCrawler::View do
     output = ""
     io = StringIO.new(output)
     WebCrawler::View.factory('json', [[1, 2, 3]]).draw(io)
-    output.should == "{\"responses\":[[1,2,3]]}\n"
+    output.should == "[[1,2,3]]\n"
   end
 end
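The thread running through these view hunks: JSON output lost its {"responses": ...} envelope, XML is now pretty-printed by default (pretty: false opts out), and CSV options moved under a csv: key. Rendering to an IO is unchanged, as the last example shows; a minimal sketch:

    require 'stringio'

    io = StringIO.new("")
    WebCrawler::View.factory('json', [[1, 2, 3]]).draw(io)
    io.string # => "[[1,2,3]]\n"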
@@ -0,0 +1,143 @@
+ require "spec_helper"
+
+ describe WebCrawler::Base do
+   require "fixtures/my_crawler"
+
+   before(:all) do
+     @uri_map = FakeWeb::Registry.instance.uri_map
+     FakeWeb.clean_registry
+
+     MyCrawler.targets.each do |url|
+       FakeWeb.register_uri(:get, url, :body => 'spec/fixtures/example.xml', :content_type => "text/html; charset=windows-1251")
+     end
+   end
+
+   after(:all) do
+     FakeWeb::Registry.instance.uri_map = @uri_map
+   end
+
+   describe ' > ', MyCrawler do
+     subject { MyCrawler.new }
+
+     it "should be instance of MyCrawler" do
+       subject.should be_a MyCrawler
+       subject.should be_a_kind_of described_class
+     end
+
+     it "should have a target urls" do
+       subject.targets.should be_a ::Set
+       subject.targets.should have(20).urls
+     end
+
+     it "should generate an urls" do
+       pattern = "www.example.com/category_:category/page:page/"
+       options = { :category => [1, 2, 3, 4], :page => 1..3 }
+       described_class.send(:generate_urls, pattern, options).should == ["www.example.com/category_1/page1/",
+                                                                         "www.example.com/category_1/page2/",
+                                                                         "www.example.com/category_1/page3/",
+                                                                         "www.example.com/category_2/page1/",
+                                                                         "www.example.com/category_2/page2/",
+                                                                         "www.example.com/category_2/page3/",
+                                                                         "www.example.com/category_3/page1/",
+                                                                         "www.example.com/category_3/page2/",
+                                                                         "www.example.com/category_3/page3/",
+                                                                         "www.example.com/category_4/page1/",
+                                                                         "www.example.com/category_4/page2/",
+                                                                         "www.example.com/category_4/page3/"]
+     end
+
+     it "logger should be attached to tmp/file.log" do
+       subject.logger.should be_a Logger
+       subject.logger.instance_variable_get(:@logdev).dev.path.should == '/tmp/file.log'
+     end
+
+     it "logger should be attached to Logger.new(STDERR)" do
+       class MyCrawler
+         log_to Logger.new(STDERR)
+       end
+       subject.logger.should be_a Logger
+       subject.logger.instance_variable_get(:@logdev).dev.should == STDERR
+     end
+
+     it "cache should be set" do
+       WebCrawler.config.cache.adapter.should be_a WebCrawler::CacheAdapter::Base
+     end
+
+     it "follow should collect urls from given url and fill targets" do
+       FakeWeb.register_uri(:get, urls_board_path, :body => follower_body)
+       FakeWeb.register_uri(:get, 'http://example.com/2323.html', :body => '')
+       FakeWeb.register_uri(:get, 'http://example.com/2323.html?rr=1', :body => '')
+       class TestCrawler < WebCrawler::Base
+         target 'http://example.com/follower' do |targets|
+           follow targets, :only => /\/\d+\.html/
+         end
+       end
+       TestCrawler.run
+       TestCrawler.targets.should == Set["http://example.com/2323.html", "http://example.com/2323.html?rr=1"]
+     end
+
+     context 'parsing' do
+
+       context 'context' do
+
+         it 'should initialize mappers' do
+           subject.mappers.should be_a Array
+           subject.mappers.should have(1).parser
+           subject.mappers.first.should be_a WebCrawler::Parsers::Mapper
+         end
+
+         context 'mapping' do
+           subject { MyCrawler.new.mappers.first.mapping }
+
+           let(:mapping_keys) { ["link",
+                                 "name",
+                                 "region",
+                                 "salary",
+                                 "description",
+                                 "contacts",
+                                 "company",
+                                 "published",
+                                 "expire",
+                                 "catalog item"] }
+
+           it { should be_a Hash }
+           it { should_not be_empty }
+           it { subject.keys.should == mapping_keys }
+         end
+
+         context 'run' do
+           it 'parse all elements and return Array' do
+             result = subject.run
+             result.should be_a Hash
+             result.keys.first.should == :jobs
+             result.values.flatten.should have(100).items
+           end
+
+           it 'parse all elements and return JSON string' do
+             result = subject.run :json
+             json = JSON.parse(result)
+
+             result.should be_a String
+             result.should =~ /\[{"source_link":/
+             json.should be_a Hash
+             json.values.flatten.should have(100).items
+           end
+
+           it 'parse all elements and return YAML string' do
+             result = subject.run :yaml
+             yaml = YAML.load(result)
+
+             result.should be_a String
+             result.should =~ /^---/
+             yaml.should be_a Hash
+             yaml.values.flatten.should have(100).items
+           end
+         end
+
+       end
+
+     end
+
+   end
+
+ end