web_crawler 0.3.1 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,108 @@
+ #coding: utf-8
+
+ class TestCrawler < WebCrawler::Base
+   target 'http://45.ru/job/vacancy/2.php' do |targets|
+     follow targets, :only => /\/job\/vacancy\/\d+\.html/
+   end
+
+   cache_to '/tmp/wcrawler_cache'
+
+   log_to nil
+
+   context '#block_center > table:first', :vacancies do |table|
+     table.search('td.bg_color2').map do |key|
+       [key.inner_text.strip, key.search('~').inner_text.strip]
+     end
+   end
+
+   protected
+
+   class <<self
+
+     def process(*)
+       normalize_data(super)
+     end
+
+     def normalize_data(data)
+       data[:vacancies].map do |vacancy|
+         Hash[vacancy.map { |key, value| [translate_key(key), normalize_value(value)] }]
+       end
+     end
+
+     def translate_key(key)
+       { "Город" => :city_name,
+         "Фирма" => :company_name,
+         "График работы" => :schedule,
+         "Тип работы" => :employment_type,
+         "Зарплата, руб." => :profit,
+         "Должность" => :name,
+         "Условия" => :conditions,
+         "Требования" => :requirements,
+         "Обязанности" => :responsibilities,
+         "О компании" => :company_description,
+         "Знание языков" => :known_languages,
+         "Знание компьютера" => :known_computer,
+         "Образование" => :education,
+         "Место работы (район)" => :place_of_work,
+         "Бизнес-образование" => :business_education,
+         "Телефон" => :phone,
+         "Контактное лицо" => :contact_name,
+         "E-mail" => :email,
+         "Адрес" => :address,
+         "Стаж" => :experience,
+         "http://" => :site,
+         "url" => :url }[key]
+     end
+
+     def normalize_value(value)
+       value.gsub(/\t+/,' ')
+     end
+
+   end
+
+ end
+
+ __END__
+
+
+
+ Город Курган
+ Фирма Розничная сеть Л`Этуаль
+ Должность Продавец-консультант ЛЭтуаль
+ Зарплата, руб. 20 000
+ Форма оплаты Оклад+%
+ График работы Полный рабочий день
+ Тип работы Постоянная
+ Условия Условия работы:
+ • Корпоративное обучение
+ • График работы сменный;
+ • Оформление по Трудовому Кодексу РФ, соц. Пакет
+ • Конкурентная заработная плата, оклад + %
+
+ В компании разработана уникальная программа карьерного роста для наших сотрудников!
+ Требования Требования:
+ • Образование средне-специальное, высшее
+ • Опыт продаж от 1 года,
+ • Возраст от 23 до 35 лет, приятный внешний вид
+ • Умение находить контакт с любым покупателем
+ • Готовность изменить свой имидж в соответствии с корпоративными требованиями компании
+ • Высокая работоспособность, активная жизненная позиция
+ • Стрессоустойчивость, хорошая память, желание работать и развиваться
+ Обязанности Обязанности:
+ • Грамотное консультирование клиента по ассортименту
+ • Продажа косметики и парфюмерии
+ • Работа с кассой
+ • Мерчендайзинг
+ • Соблюдение и поддержание стандартов компании
+ • Поддержание чистоты рабочего места
+ О компании Продавец-консультант - это лицо нашей компании. От качества работы, выполняемой им, зависит общий успех – его и компании. Мы декларируем как основные преимущества нашей компании – уникальный дизайн, богатый ассортимент продукции, гибкую систему скидок и специальные предложения, а самое главное – грамотность, профессионализм и вежливость продавцов-консультантов наших магазинов!
+
+ ОБЯЗАТЕЛЬНО УКАЗЫВАЙТЕ в теме письма "Продавец-консультант г Курган"
+ Образование Среднее специальное
+ Стаж 1
+ Степень ограничения трудоспособности Отсутствует
+ Телефон 8-982-602-8331
+ Контактное лицо Татьяна Александровна
+ E-mail hrm1-svx-ur@letuin.ru
+ http:// www.letoile.ru%2F
+ Документы для скачивания
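The fixture above exercises the crawler DSL end to end: target seeds the URL set (its block can follow harvested links), cache_to and log_to configure storage and logging, and context registers a parsing block whose results land under the :vacancies key. A minimal usage sketch, assuming the class is loaded and its target host is stubbed (e.g. with FakeWeb, as the specs later in this diff do); the return shape of run is inferred from the comments and specs elsewhere in this diff, not asserted here:

require 'web_crawler'

# Hypothetical invocation of the fixture class defined above.
vacancies = TestCrawler.run   # parsed, normalized vacancy hashes
vacancies.first[:city_name]   # keys come from translate_key, e.g. "Курган"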
@@ -0,0 +1,77 @@
+ #encoding: utf-8
+
+ class TestCrawler2 < WebCrawler::Base
+
+   target "http://www.superjob.ru/export/vacs_to_xml.php"
+
+   log_to "/tmp/file.log" # or Logger.new(...)
+
+   cache_to '/tmp/wcrawler/cache' # or (CacheClass < CacheAdapter).new *args
+
+   context "job", :jobs do
+
+     map 'link', :to => :source_link, :on => :inner_text # default :on => :inner_text
+     map 'name', :to => :name
+     map 'region', :to => :city_name
+     map 'salary', :to => :profit
+     map 'description', :to => :description, :filter => :format_description
+     map 'contacts', :to => :contact_text
+     map 'company', :to => :company, :on => [:attr, :id]
+     map 'published', :to => :published_at
+     map 'expire', :to => :expire_at
+     map 'catalog item', :to => :specialization_ids, :on => nil, :filter => :convert_specs
+
+   end
+
+   protected
+
+   def self.format_description(text)
+     @titles ||= ["Условия работы и компенсации:\n",
+                  "Место работы:\n",
+                  "Должностные обязанности:\n",
+                  "Требования к квалификации:\n"]
+
+     text.each_line.inject("") { |new_text, line| new_text << (@titles.include?(line) ? "<h4>#{line.chomp}</h4>\n" : line) }
+   end
+
+   def self.convert_specs(specs)
+     @ids_mapping ||= {
+       911 => 4537,
+       1 => 4274,
+       5 => 4335,
+       6 => 4408,
+       16 => [4756, 4545],
+       3 => 4488,
+       9 => 4303,
+       8 => 4649,
+       547 => 4237,
+       579 => 4237,
+       1104 => 4671,
+       10 => 4588,
+       814 => 4568,
+       2 => 4714,
+       11 => 4671,
+       13 => 4691,
+       15 => 4649,
+       17 => 4504,
+       601 => 4428,
+       45 => 4632,
+       22 => 4473,
+       515 => 4524,
+       19 => 4473,
+       20 => 4524,
+       398 => 4749,
+       503 => 4775,
+       941 => 4742,
+       1434 => 4802,
+       2109 => 4537
+     }
+     specs.map { |i| @ids_mapping[i['thread'].to_i] }.to_a.flatten
+   end
+
+ end
+
+
+ #MyCrawler.run # => return Array
+ #MyCrawler.run(:json) # => return String like a JSON object
+ #MyCrawler.run(:yaml) # => return String of YAML format
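As the trailing comments indicate, the serialization format is chosen at run time. A hedged sketch of consuming each form, using the class name defined in this fixture (the :jobs key comes from the context "job", :jobs declaration; return shapes follow the base spec later in this diff):

crawler = TestCrawler2.new
result  = crawler.run          # per the base spec: a Hash keyed by :jobs
json    = crawler.run(:json)   # String, e.g. '{"jobs":[{"source_link":...},...]}'
yaml    = crawler.run(:yaml)   # String beginning with "---"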
data/spec/spec_helper.rb CHANGED
@@ -9,9 +9,14 @@ require 'fake_web_generator'
  RSpec.configure do |c|
    c.mock_with :rspec
    c.include FakeWebGenerator
- end

- WebCrawler.configure do
-   config.cache_adapter = WebCrawler::CacheAdapter::Memory.new
+   c.before(:each) do
+     WebCrawler.configure do
+       config.logger = nil
+       config.cache_adapter = WebCrawler::CacheAdapter::Memory.new
+       config.logger.level = Logger::ERROR
+     end
+   end
  end

+
@@ -27,19 +27,8 @@ describe WebCrawler::BatchRequest do
  end

  it "should process requests" do
-   subject.requests.map { |r| r.should_receive(:process).with(no_args).and_return(responses.first) }
    subject.process.should be_a Array
    subject.process.first.should be_a WebCrawler::Response
  end

- it "should accept :parser option with parser class or object" do
-   class ::TestParser
-     def parse(resp)
-       resp.to_s + ' parsed'
-     end
-   end
-   described_class.new(urls, parser: TestParser.new).process.should == ["Example body parsed",
-                                                                        "Example body1 parsed",
-                                                                        "Example body for url http://example.com/2 parsed"]
- end
  end
@@ -11,21 +11,27 @@ describe 'Cached requests' do
  let(:urls) { ['example.com/1', 'example.com/2', 'example.com'] }

  it 'should not send requests to the web if cache exists' do
-   FakeWeb.register_uri(:get, "http://example.com/1", :body => "Example body1")
-   first_response = FakeWeb.response_for :get, "http://example.com/1"
+   FakeWeb.register_uri(:get, "http://example.com/cached", :body => "cached Example body1")
+   first_response = FakeWeb.response_for :get, "http://example.com/cached"

-   FakeWeb.should_receive(:response_for).with(:get, "http://example.com/1").and_return { first_response }
-
-   lambda {
-     WebCrawler::BatchRequest.new("http://example.com/1", cached: true).process
-   }.should raise_error(ArgumentError, /response must be a Net::HTTPResponse/)
+   WebCrawler::BatchRequest.new("http://example.com/cached").process
+   WebCrawler::config.cache.adapter.put(WebCrawler::Response.new(URI.parse("http://example.com/cached"), first_response))

+   cached_response = WebCrawler::config.cache.adapter.get("http://example.com/cached")
    FakeWeb.should_not_receive(:response_for)

-   WebCrawler::config.cache_adapter.put(WebCrawler::Response.new(URI.parse("http://example.com/1"), first_response))
-
-   cached_response = WebCrawler::config.cache_adapter.get("http://example.com/1")
-   WebCrawler::BatchRequest.new("http://example.com/1", cached: true).process.first.should be cached_response
+   WebCrawler::BatchRequest.new("http://example.com/cached").process.first.should be cached_response
  end

+ it 'should not be cached' do
+   FakeWeb.register_uri(:get, "http://example.com/cached", :body => "cached Example body1")
+   first_response = FakeWeb.response_for :get, "http://example.com/cached"
+
+   WebCrawler::BatchRequest.new("http://example.com/cached").process
+   WebCrawler::config.cache.adapter.put(WebCrawler::Response.new(URI.parse("http://example.com/cached"), first_response))
+
+   cached_response = WebCrawler::config.cache.adapter.get("http://example.com/cached")
+
+   WebCrawler::BatchRequest.new("http://example.com/cached", no_cached: true).process.first.should_not be cached_response
+ end
  end
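Beyond renaming the test URL, these two examples document the reworked cache surface in 0.5.0: the adapter is now reached through WebCrawler::config.cache.adapter (0.3.1 exposed config.cache_adapter directly), responses are served from the cache by default, and no_cached: true forces a fresh request. A condensed sketch of that flow, assuming the URI is registered with FakeWeb as in the spec:

url = "http://example.com/cached"

WebCrawler::BatchRequest.new(url).process                    # first request fills the cache
WebCrawler::config.cache.adapter.get(url)                    # => the cached WebCrawler::Response
WebCrawler::BatchRequest.new(url).process                    # now served from the cache
WebCrawler::BatchRequest.new(url, no_cached: true).process   # bypasses the cache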
@@ -3,14 +3,14 @@ require "spec_helper"
  describe WebCrawler::FactoryUrl do

    it "should generate urls with block" do
-     first_param = [1,2,3]
+     first_param = [1, 2, 3]
      second_param = 10...15

      factory = WebCrawler::FactoryUrl.new(first_param, second_param) do |*args|
        random = rand(3000)
        "www.example.com/%s/%s.html?rid=#{random}" % args
      end
-     urls = factory.factory
+     urls = factory.factory

      urls.should be_a Array
      factory.params.size.should == 15
@@ -19,16 +19,29 @@ describe WebCrawler::FactoryUrl do
    end

    it "should generate urls with pattern" do
-     first_param = [1,2,3]
+     first_param = [1, 2, 3]
      second_param = 10...15

      factory = WebCrawler::FactoryUrl.new("www.example.com/$1/$2.html", first_param, second_param)
-     urls = factory.factory
+     urls = factory.factory

      urls.should be_a Array
-     factory.params.size.should == 15
+     factory.params.should have(15).items
      urls.should have(factory.params.size).urls
      urls.first.should == "www.example.com/1/10.html"
    end
-
+
+   it "should generate urls with pattern and hash options" do
+     pattern = "www.example.com/category_:category/page:page/"
+     options = { :page => 1..3, :category => [1, 2, 3, 4] }
+
+     factory = WebCrawler::FactoryUrl.new(pattern, options)
+     urls = factory.factory
+
+     urls.should be_a Array
+     factory.params.should have(12).items
+     urls.should have(factory.params.size).urls
+     urls.first.should == "www.example.com/category_1/page1/"
+   end
+
  end
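The new hash-options form added here sits alongside the existing block and $1-pattern styles. A condensed sketch of all three, drawn from these specs (the product of 3 pages and 4 categories gives the 12 urls asserted above):

# Block form: each combination of positional params is yielded to the block.
WebCrawler::FactoryUrl.new([1, 2, 3], 10...15) { |a, b| "www.example.com/#{a}/#{b}.html" }.factory

# Positional pattern: $1, $2 are replaced by the params in order.
WebCrawler::FactoryUrl.new("www.example.com/$1/$2.html", [1, 2, 3], 10...15).factory

# New in 0.5.0: named placeholders filled from a hash of enumerables.
WebCrawler::FactoryUrl.new("www.example.com/category_:category/page:page/",
                           :page => 1..3, :category => [1, 2, 3, 4]).factory
# => ["www.example.com/category_1/page1/", ...] (12 urls)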
@@ -7,16 +7,23 @@ describe WebCrawler::Follower do
    responses = WebCrawler::BatchRequest.new(urls_board_path).process
    urls = WebCrawler::Follower.new(responses).collect

-   urls.first.should have(9).urls
-   urls.first.should == known_urls
+   urls.should have(9).urls
+   urls.should == known_urls
  end

  it "should collect all the unique url with same host like in responses" do
    responses = WebCrawler::BatchRequest.new(urls_board_path).process
    urls = WebCrawler::Follower.new(responses, same_host: true).collect

-   urls.first.should have(6).urls
-   urls.first.should == known_urls.reject { |u| u =~ /otherhost/ }
+   urls.should have(6).urls
+   urls.should == known_urls.reject { |u| u =~ /otherhost/ }
+ end
+
+ it "should collect all the unique url like a given regexp" do
+   responses = WebCrawler::BatchRequest.new(urls_board_path).process
+   urls = WebCrawler::Follower.new(responses, only: /\/\d+\.html/).collect
+   urls.should have(2).urls
+   urls.should == known_urls.select { |u| u =~ /\/\d+\.html/ }
  end

  it "should process requests for following urls" do
@@ -15,8 +15,8 @@ describe WebCrawler::View::Csv do
  end

  it "should render input array to csv string with options" do
-   described_class.new(input, headers: [:title, :url, :author], col_sep: ";").render.should == "title;url;author\n1;2;3\nstring;\"other string\n\"\n"
-   described_class.new(input, headers: [:title, :url, :author], row_sep: "\n\n").render.should == "title,url,author\n\n1,2,3\n\nstring,\"other string\n\"\n\n"
+   described_class.new(input, headers: [:title, :url, :author], csv: {col_sep: ";"}).render.should == "title;url;author\n1;2;3\nstring;\"other string\n\"\n"
+   described_class.new(input, headers: [:title, :url, :author], csv: {row_sep: "\n\n"}).render.should == "title,url,author\n\n1,2,3\n\nstring,\"other string\n\"\n\n"
  end

  end
@@ -27,15 +27,15 @@ describe WebCrawler::View::Json do
  let(:input_hash) { [{ :title=>1, :url=>2, :author=>3 }, { :title=>"string", :url=>"other string\n", :author=>nil }] }

  it "should render input array to json string" do
-   described_class.new(input, headers: [:title, :url, :author]).render.should == '{"responses":[[1,2,"3"],["string","other string\n"]]}'
+   described_class.new(input, headers: [:title, :url, :author]).render.should == '[[1,2,"3"],["string","other string\n"]]'
  end

  it "should render input hash to json string" do
    json = described_class.new(input_hash).render
-   json.should == "{\"responses\":[{\"title\":1,\"url\":2,\"author\":3},{\"title\":\"string\",\"url\":\"other string\\n\",\"author\":null}]}"
-   hash = JSON.parse(json).symbolize_keys
-   hash[:responses].each(&:symbolize_keys!)
-   hash.should == { responses: input_hash }
+   json.should == '[{"title":1,"url":2,"author":3},{"title":"string","url":"other string\n","author":null}]'
+   hash = JSON.parse(json).map &:symbolize_keys
+   hash.each(&:symbolize_keys!)
+   hash.should == input_hash
  end
  end

@@ -50,7 +50,7 @@ describe WebCrawler::View::Xml do
          "<response><title>1</title><url>2</url><author>3</author></response>" <<
          "<response><title>string</title><url>other string\n</url><author></author></response>" <<
          "</responses>"
-   described_class.new(input, headers: [:title, :url, :author]).render.should == xml
+   described_class.new(input, headers: [:title, :url, :author], pretty: false).render.should == xml
  end

  it "should render input array to pretty xml string" do
@@ -58,7 +58,7 @@ describe WebCrawler::View::Xml do
          "<response><title>1</title><url>2</url><author>3</author></response>\n" <<
          "<response><title>string</title><url>other string\n</url><author></author></response>\n" <<
          "</responses>"
-   described_class.new(input, headers: [:title, :url, :author], pretty: true).render.should == xml
+   described_class.new(input, headers: [:title, :url, :author]).render.should == xml
  end

  it "should render input array without :headers to xml string" do
@@ -90,6 +90,6 @@ describe WebCrawler::View do
    output = ""
    io = StringIO.new(output)
    WebCrawler::View.factory('json', [[1, 2, 3]]).draw(io)
-   output.should == "{\"responses\":[[1,2,3]]}\n"
+   output.should == "[[1,2,3]]\n"
  end
  end
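Three behavioral changes fall out of these view specs: CSV options now nest under a :csv key, JSON output drops the {"responses": ...} wrapper, and XML is pretty-printed by default (pass pretty: false for the old compact form). A short sketch; rows is a placeholder input array:

rows = [[1, 2, 3]]

WebCrawler::View::Json.new(rows).render   # => "[[1,2,3]]", no wrapper object
WebCrawler::View::Csv.new(rows, headers: [:title, :url, :author], csv: { col_sep: ";" }).render
WebCrawler::View::Xml.new(rows, headers: [:title, :url, :author], pretty: false).render   # compact XML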
@@ -0,0 +1,143 @@
+ require "spec_helper"
+
+ describe WebCrawler::Base do
+   require "fixtures/my_crawler"
+
+   before(:all) do
+     @uri_map = FakeWeb::Registry.instance.uri_map
+     FakeWeb.clean_registry
+
+     MyCrawler.targets.each do |url|
+       FakeWeb.register_uri(:get, url, :body => 'spec/fixtures/example.xml', :content_type => "text/html; charset=windows-1251")
+     end
+   end
+
+   after(:all) do
+     FakeWeb::Registry.instance.uri_map = @uri_map
+   end
+
+   describe ' > ', MyCrawler do
+     subject { MyCrawler.new }
+
+     it "should be instance of MyCrawler" do
+       subject.should be_a MyCrawler
+       subject.should be_a_kind_of described_class
+     end
+
+     it "should have a target urls" do
+       subject.targets.should be_a ::Set
+       subject.targets.should have(20).urls
+     end
+
+     it "should generate an urls" do
+       pattern = "www.example.com/category_:category/page:page/"
+       options = { :category => [1, 2, 3, 4], :page => 1..3 }
+       described_class.send(:generate_urls, pattern, options).should == ["www.example.com/category_1/page1/",
+                                                                         "www.example.com/category_1/page2/",
+                                                                         "www.example.com/category_1/page3/",
+                                                                         "www.example.com/category_2/page1/",
+                                                                         "www.example.com/category_2/page2/",
+                                                                         "www.example.com/category_2/page3/",
+                                                                         "www.example.com/category_3/page1/",
+                                                                         "www.example.com/category_3/page2/",
+                                                                         "www.example.com/category_3/page3/",
+                                                                         "www.example.com/category_4/page1/",
+                                                                         "www.example.com/category_4/page2/",
+                                                                         "www.example.com/category_4/page3/"]
+     end
+
+     it "logger should be attached to tmp/file.log" do
+       subject.logger.should be_a Logger
+       subject.logger.instance_variable_get(:@logdev).dev.path.should == '/tmp/file.log'
+     end
+
+     it "logger should be attached to Logger.new(STDERR)" do
+       class MyCrawler
+         log_to Logger.new(STDERR)
+       end
+       subject.logger.should be_a Logger
+       subject.logger.instance_variable_get(:@logdev).dev.should == STDERR
+     end
+
+     it "cache should be set" do
+       WebCrawler.config.cache.adapter.should be_a WebCrawler::CacheAdapter::Base
+     end
+
+     it "follow should collect urls from given url and fill targets" do
+       FakeWeb.register_uri(:get, urls_board_path, :body => follower_body)
+       FakeWeb.register_uri(:get, 'http://example.com/2323.html', :body => '')
+       FakeWeb.register_uri(:get, 'http://example.com/2323.html?rr=1', :body => '')
+       class TestCrawler < WebCrawler::Base
+         target 'http://example.com/follower' do |targets|
+           follow targets, :only => /\/\d+\.html/
+         end
+       end
+       TestCrawler.run
+       TestCrawler.targets.should == Set["http://example.com/2323.html", "http://example.com/2323.html?rr=1"]
+     end
+
+     context 'parsing' do
+
+       context 'context' do
+
+         it 'should initialize mappers' do
+           subject.mappers.should be_a Array
+           subject.mappers.should have(1).parser
+           subject.mappers.first.should be_a WebCrawler::Parsers::Mapper
+         end
+
+         context 'mapping' do
+           subject { MyCrawler.new.mappers.first.mapping }
+
+           let(:mapping_keys) { ["link",
+                                 "name",
+                                 "region",
+                                 "salary",
+                                 "description",
+                                 "contacts",
+                                 "company",
+                                 "published",
+                                 "expire",
+                                 "catalog item"] }
+
+           it { should be_a Hash }
+           it { should_not be_empty }
+           it { subject.keys.should == mapping_keys }
+         end
+
+         context 'run' do
+           it 'parse all elements and return Array' do
+             result = subject.run
+             result.should be_a Hash
+             result.keys.first.should == :jobs
+             result.values.flatten.should have(100).items
+           end
+
+           it 'parse all elements and return JSON string' do
+             result = subject.run :json
+             json = JSON.parse(result)
+
+             result.should be_a String
+             result.should =~ /\[{"source_link":/
+             json.should be_a Hash
+             json.values.flatten.should have(100).items
+           end
+
+           it 'parse all elements and return YAML string' do
+             result = subject.run :yaml
+             yaml = YAML.load(result)
+
+             result.should be_a String
+             result.should =~ /^---/
+             yaml.should be_a Hash
+             yaml.values.flatten.should have(100).items
+           end
+         end
+
+       end
+
+     end
+
+   end
+
+ end