web_crawler 0.3.1 → 0.5.0

@@ -0,0 +1,108 @@
+ #coding: utf-8
+
+ class TestCrawler < WebCrawler::Base
+   target 'http://45.ru/job/vacancy/2.php' do |targets|
+     follow targets, :only => /\/job\/vacancy\/\d+\.html/
+   end
+
+   cache_to '/tmp/wcrawler_cache'
+
+   log_to nil
+
+   context '#block_center > table:first', :vacancies do |table|
+     table.search('td.bg_color2').map do |key|
+       [key.inner_text.strip, key.search('~').inner_text.strip]
+     end
+   end
+
+   protected
+
+   class << self
+
+     def process(*)
+       normalize_data(super)
+     end
+
+     def normalize_data(data)
+       data[:vacancies].map do |vacancy|
+         Hash[vacancy.map { |key, value| [translate_key(key), normalize_value(value)] }]
+       end
+     end
+
+     def translate_key(key)
+       { "Город" => :city_name,
+         "Фирма" => :company_name,
+         "График работы" => :schedule,
+         "Тип работы" => :employment_type,
+         "Зарплата, руб." => :profit,
+         "Должность" => :name,
+         "Условия" => :conditions,
+         "Требования" => :requirements,
+         "Обязанности" => :responsibilities,
+         "О компании" => :company_description,
+         "Знание языков" => :known_languages,
+         "Знание компьютера" => :known_computer,
+         "Образование" => :education,
+         "Место работы (район)" => :place_of_work,
+         "Бизнес-образование" => :business_education,
+         "Телефон" => :phone,
+         "Контактное лицо" => :contact_name,
+         "E-mail" => :email,
+         "Адрес" => :address,
+         "Стаж" => :experience,
+         "http://" => :site,
+         "url" => :url }[key]
+     end
+
+     def normalize_value(value)
+       value.gsub(/\t+/, ' ')
+     end
+
+   end
+
+ end
+
+ __END__
+
+
+
+ Город Курган
+ Фирма Розничная сеть Л`Этуаль
+ Должность Продавец-консультант ЛЭтуаль
+ Зарплата, руб. 20 000
+ Форма оплаты Оклад+%
+ График работы Полный рабочий день
+ Тип работы Постоянная
+ Условия Условия работы:
+ • Корпоративное обучение
+ • График работы сменный;
+ • Оформление по Трудовому Кодексу РФ, соц. Пакет
+ • Конкурентная заработная плата, оклад + %
+
+ В компании разработана уникальная программа карьерного роста для наших сотрудников!
+ Требования Требования:
+ • Образование средне-специальное, высшее
+ • Опыт продаж от 1 года,
+ • Возраст от 23 до 35 лет, приятный внешний вид
+ • Умение находить контакт с любым покупателем
+ • Готовность изменить свой имидж в соответствии с корпоративными требованиями компании
+ • Высокая работоспособность, активная жизненная позиция
+ • Стрессоустойчивость, хорошая память, желание работать и развиваться
+ Обязанности Обязанности:
+ • Грамотное консультирование клиента по ассортименту
+ • Продажа косметики и парфюмерии
+ • Работа с кассой
+ • Мерчендайзинг
+ • Соблюдение и поддержание стандартов компании
+ • Поддержание чистоты рабочего места
+ О компании Продавец-консультант - это лицо нашей компании. От качества работы, выполняемой им, зависит общий успех – его и компании. Мы декларируем как основные преимущества нашей компании – уникальный дизайн, богатый ассортимент продукции, гибкую систему скидок и специальные предложения, а самое главное – грамотность, профессионализм и вежливость продавцов-консультантов наших магазинов!
+
+ ОБЯЗАТЕЛЬНО УКАЗЫВАЙТЕ в теме письма "Продавец-консультант г Курган"
+ Образование Среднее специальное
+ Стаж 1
+ Степень ограничения трудоспособности Отсутствует
+ Телефон 8-982-602-8331
+ Контактное лицо Татьяна Александровна
+ E-mail hrm1-svx-ur@letuin.ru
+ http:// www.letoile.ru%2F
+ Документы для скачивания
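The fixture above exercises the whole crawling DSL: target plus follow seed and expand the URL frontier, cache_to and log_to configure caching and logging, a context block extracts label/value pairs from each page, and a class << self override of process post-processes the parsed result. A minimal sketch of the same shape (class name, selectors, and URLs are illustrative, not from the gem):

    #coding: utf-8
    require 'web_crawler'

    class ListingCrawler < WebCrawler::Base
      # Seed with an index page; queue only links that look like detail pages.
      target 'http://example.com/listings/index.html' do |targets|
        follow targets, :only => /\/listings\/\d+\.html/
      end

      cache_to '/tmp/listing_cache' # responses are cached between runs
      log_to nil                    # silence logging

      # The block receives the node matched by the CSS selector and returns
      # whatever structure should appear under the :listings key of the result.
      context 'table.listings', :listings do |table|
        table.search('td.label').map do |cell|
          [cell.inner_text.strip, cell.search('~').inner_text.strip]
        end
      end
    end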
@@ -0,0 +1,77 @@
+ #encoding: utf-8
+
+ class TestCrawler2 < WebCrawler::Base
+
+   target "http://www.superjob.ru/export/vacs_to_xml.php"
+
+   log_to "/tmp/file.log" # or Logger.new(...)
+
+   cache_to '/tmp/wcrawler/cache' # or (CacheClass < CacheAdapter).new *args
+
+   context "job", :jobs do
+
+     map 'link', :to => :source_link, :on => :inner_text # default :on => :inner_text
+     map 'name', :to => :name
+     map 'region', :to => :city_name
+     map 'salary', :to => :profit
+     map 'description', :to => :description, :filter => :format_description
+     map 'contacts', :to => :contact_text
+     map 'company', :to => :company, :on => [:attr, :id]
+     map 'published', :to => :published_at
+     map 'expire', :to => :expire_at
+     map 'catalog item', :to => :specialization_ids, :on => nil, :filter => :convert_specs
+
+   end
+
+   protected
+
+   def self.format_description(text)
+     @titles ||= ["Условия работы и компенсации:\n",
+                  "Место работы:\n",
+                  "Должностные обязанности:\n",
+                  "Требования к квалификации:\n"]
+
+     text.each_line.inject("") { |new_text, line| new_text << (@titles.include?(line) ? "<h4>#{line.chomp}</h4>\n" : line) }
+   end
+
+   def self.convert_specs(specs)
+     @ids_mapping ||= {
+       911 => 4537,
+       1 => 4274,
+       5 => 4335,
+       6 => 4408,
+       16 => [4756, 4545],
+       3 => 4488,
+       9 => 4303,
+       8 => 4649,
+       547 => 4237,
+       579 => 4237,
+       1104 => 4671,
+       10 => 4588,
+       814 => 4568,
+       2 => 4714,
+       11 => 4671,
+       13 => 4691,
+       15 => 4649,
+       17 => 4504,
+       601 => 4428,
+       45 => 4632,
+       22 => 4473,
+       515 => 4524,
+       19 => 4473,
+       20 => 4524,
+       398 => 4749,
+       503 => 4775,
+       941 => 4742,
+       1434 => 4802,
+       2109 => 4537
+     }
+     specs.map { |i| @ids_mapping[i['thread'].to_i] }.to_a.flatten
+   end
+
+ end
+
+
+ # MyCrawler.run        # => returns an Array
+ # MyCrawler.run(:json) # => returns a String containing JSON
+ # MyCrawler.run(:yaml) # => returns a String in YAML format
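Combined with the comments above, a run of this fixture would look roughly like this (a sketch; the exact shape of the parsed data is whatever the crawler's contexts produce):

    data = TestCrawler2.run          # parsed result as a Ruby structure
    json = TestCrawler2.run(:json)   # the same data serialized to a JSON string
    yaml = TestCrawler2.run(:yaml)   # the same data serialized to a YAML string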
data/spec/spec_helper.rb CHANGED
@@ -9,9 +9,14 @@ require 'fake_web_generator'
 RSpec.configure do |c|
   c.mock_with :rspec
   c.include FakeWebGenerator
-end
 
-WebCrawler.configure do
-  config.cache_adapter = WebCrawler::CacheAdapter::Memory.new
+  c.before(:each) do
+    WebCrawler.configure do
+      config.logger = nil
+      config.cache_adapter = WebCrawler::CacheAdapter::Memory.new
+      config.logger.level = Logger::ERROR
+    end
+  end
 end
 
+
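The helper now resets the crawler configuration before every example instead of configuring it once for the whole suite. The same configure block works outside the specs; a minimal sketch (assuming config falls back to a default logger when none is assigned):

    require 'web_crawler'

    WebCrawler.configure do
      config.cache_adapter = WebCrawler::CacheAdapter::Memory.new # in-memory cache, as in the specs
      config.logger.level  = Logger::ERROR                        # log errors only
    end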
@@ -27,19 +27,8 @@ describe WebCrawler::BatchRequest do
   end
 
   it "should process requests" do
-    subject.requests.map { |r| r.should_receive(:process).with(no_args).and_return(responses.first) }
     subject.process.should be_a Array
     subject.process.first.should be_a WebCrawler::Response
   end
 
-  it "should accept :parser option with parser class or object" do
-    class ::TestParser
-      def parse(resp)
-        resp.to_s + ' parsed'
-      end
-    end
-    described_class.new(urls, parser: TestParser.new).process.should == ["Example body parsed",
-                                                                         "Example body1 parsed",
-                                                                         "Example body for url http://example.com/2 parsed"]
-  end
 end
@@ -11,21 +11,27 @@ describe 'Cached requests' do
   let(:urls) { ['example.com/1', 'example.com/2', 'example.com'] }
 
   it 'should not send requests to the web if cache exists' do
-    FakeWeb.register_uri(:get, "http://example.com/1", :body => "Example body1")
-    first_response = FakeWeb.response_for :get, "http://example.com/1"
+    FakeWeb.register_uri(:get, "http://example.com/cached", :body => "cached Example body1")
+    first_response = FakeWeb.response_for :get, "http://example.com/cached"
 
-    FakeWeb.should_receive(:response_for).with(:get, "http://example.com/1").and_return { first_response }
-
-    lambda {
-      WebCrawler::BatchRequest.new("http://example.com/1", cached: true).process
-    }.should raise_error(ArgumentError, /response must be a Net::HTTPResponse/)
+    WebCrawler::BatchRequest.new("http://example.com/cached").process
+    WebCrawler::config.cache.adapter.put(WebCrawler::Response.new(URI.parse("http://example.com/cached"), first_response))
 
+    cached_response = WebCrawler::config.cache.adapter.get("http://example.com/cached")
     FakeWeb.should_not_receive(:response_for)
 
-    WebCrawler::config.cache_adapter.put(WebCrawler::Response.new(URI.parse("http://example.com/1"), first_response))
-
-    cached_response = WebCrawler::config.cache_adapter.get("http://example.com/1")
-    WebCrawler::BatchRequest.new("http://example.com/1", cached: true).process.first.should be cached_response
+    WebCrawler::BatchRequest.new("http://example.com/cached").process.first.should be cached_response
   end
 
+  it 'should not be cached' do
+    FakeWeb.register_uri(:get, "http://example.com/cached", :body => "cached Example body1")
+    first_response = FakeWeb.response_for :get, "http://example.com/cached"
+
+    WebCrawler::BatchRequest.new("http://example.com/cached").process
+    WebCrawler::config.cache.adapter.put(WebCrawler::Response.new(URI.parse("http://example.com/cached"), first_response))
+
+    cached_response = WebCrawler::config.cache.adapter.get("http://example.com/cached")
+
+    WebCrawler::BatchRequest.new("http://example.com/cached", no_cached: true).process.first.should_not be cached_response
+  end
 end
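These two examples pin down the new cache semantics: once a response has been put into the configured cache adapter, later requests for the same URL are served from the cache, and no_cached: true opts a request out. In sketch form (URL as in the specs above):

    WebCrawler::BatchRequest.new("http://example.com/cached").process
    # => served from the cache adapter when an entry exists

    WebCrawler::BatchRequest.new("http://example.com/cached", no_cached: true).process
    # => bypasses the cache and fetches afresh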
@@ -3,14 +3,14 @@ require "spec_helper"
 describe WebCrawler::FactoryUrl do
 
   it "should generate urls with block" do
-    first_param = [1,2,3]
+    first_param = [1, 2, 3]
     second_param = 10...15
 
     factory = WebCrawler::FactoryUrl.new(first_param, second_param) do |*args|
       random = rand(3000)
       "www.example.com/%s/%s.html?rid=#{random}" % args
     end
-    urls = factory.factory
+    urls = factory.factory
 
     urls.should be_a Array
     factory.params.size.should == 15
@@ -19,16 +19,29 @@ describe WebCrawler::FactoryUrl do
   end
 
   it "should generate urls with pattern" do
-    first_param = [1,2,3]
+    first_param = [1, 2, 3]
     second_param = 10...15
 
     factory = WebCrawler::FactoryUrl.new("www.example.com/$1/$2.html", first_param, second_param)
-    urls = factory.factory
+    urls = factory.factory
 
     urls.should be_a Array
-    factory.params.size.should == 15
+    factory.params.should have(15).items
     urls.should have(factory.params.size).urls
     urls.first.should == "www.example.com/1/10.html"
   end
-
+
+  it "should generate urls with pattern and hash options" do
+    pattern = "www.example.com/category_:category/page:page/"
+    options = { :page => 1..3, :category => [1, 2, 3, 4] }
+
+    factory = WebCrawler::FactoryUrl.new(pattern, options)
+    urls = factory.factory
+
+    urls.should be_a Array
+    factory.params.should have(12).items
+    urls.should have(factory.params.size).urls
+    urls.first.should == "www.example.com/category_1/page1/"
+  end
+
 end
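The specs above cover the three ways FactoryUrl expands its parameters into URLs (one URL per combination of the given enumerables). A sketch assembled from the assertions:

    # Block form: every combination of the params is yielded to the block.
    WebCrawler::FactoryUrl.new([1, 2, 3], 10...15) { |a, b| "www.example.com/#{a}/#{b}.html" }.factory

    # Positional pattern: $1, $2, ... are substituted in order.
    WebCrawler::FactoryUrl.new("www.example.com/$1/$2.html", [1, 2, 3], 10...15).factory
    # => ["www.example.com/1/10.html", ...] (3 * 5 = 15 urls)

    # Named pattern: :keys are filled from a hash of enumerables.
    WebCrawler::FactoryUrl.new("www.example.com/category_:category/page:page/",
                               :page => 1..3, :category => [1, 2, 3, 4]).factory
    # => ["www.example.com/category_1/page1/", ...] (4 * 3 = 12 urls)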
@@ -7,16 +7,23 @@ describe WebCrawler::Follower do
     responses = WebCrawler::BatchRequest.new(urls_board_path).process
     urls = WebCrawler::Follower.new(responses).collect
 
-    urls.first.should have(9).urls
-    urls.first.should == known_urls
+    urls.should have(9).urls
+    urls.should == known_urls
   end
 
   it "should collect all the unique url with same host like in responses" do
     responses = WebCrawler::BatchRequest.new(urls_board_path).process
     urls = WebCrawler::Follower.new(responses, same_host: true).collect
 
-    urls.first.should have(6).urls
-    urls.first.should == known_urls.reject { |u| u =~ /otherhost/ }
+    urls.should have(6).urls
+    urls.should == known_urls.reject { |u| u =~ /otherhost/ }
+  end
+
+  it "should collect all the unique url like a given regexp" do
+    responses = WebCrawler::BatchRequest.new(urls_board_path).process
+    urls = WebCrawler::Follower.new(responses, only: /\/\d+\.html/).collect
+    urls.should have(2).urls
+    urls.should == known_urls.select { |u| u =~ /\/\d+\.html/ }
   end
 
   it "should process requests for following urls" do
@@ -15,8 +15,8 @@ describe WebCrawler::View::Csv do
   end
 
   it "should render input array to csv string with options" do
-    described_class.new(input, headers: [:title, :url, :author], col_sep: ";").render.should == "title;url;author\n1;2;3\nstring;\"other string\n\"\n"
-    described_class.new(input, headers: [:title, :url, :author], row_sep: "\n\n").render.should == "title,url,author\n\n1,2,3\n\nstring,\"other string\n\"\n\n"
+    described_class.new(input, headers: [:title, :url, :author], csv: {col_sep: ";"}).render.should == "title;url;author\n1;2;3\nstring;\"other string\n\"\n"
+    described_class.new(input, headers: [:title, :url, :author], csv: {row_sep: "\n\n"}).render.should == "title,url,author\n\n1,2,3\n\nstring,\"other string\n\"\n\n"
   end
 
 end
@@ -27,15 +27,15 @@ describe WebCrawler::View::Json do
   let(:input_hash) { [{ :title=>1, :url=>2, :author=>3 }, { :title=>"string", :url=>"other string\n", :author=>nil }] }
 
   it "should render input array to json string" do
-    described_class.new(input, headers: [:title, :url, :author]).render.should == '{"responses":[[1,2,"3"],["string","other string\n"]]}'
+    described_class.new(input, headers: [:title, :url, :author]).render.should == '[[1,2,"3"],["string","other string\n"]]'
   end
 
   it "should render input hash to json string" do
     json = described_class.new(input_hash).render
-    json.should == "{\"responses\":[{\"title\":1,\"url\":2,\"author\":3},{\"title\":\"string\",\"url\":\"other string\\n\",\"author\":null}]}"
-    hash = JSON.parse(json).symbolize_keys
-    hash[:responses].each(&:symbolize_keys!)
-    hash.should == { responses: input_hash }
+    json.should == '[{"title":1,"url":2,"author":3},{"title":"string","url":"other string\n","author":null}]'
+    hash = JSON.parse(json).map &:symbolize_keys
+    hash.each(&:symbolize_keys!)
+    hash.should == input_hash
   end
 end
 
@@ -50,7 +50,7 @@ describe WebCrawler::View::Xml do
     "<response><title>1</title><url>2</url><author>3</author></response>" <<
     "<response><title>string</title><url>other string\n</url><author></author></response>" <<
     "</responses>"
-    described_class.new(input, headers: [:title, :url, :author]).render.should == xml
+    described_class.new(input, headers: [:title, :url, :author], pretty: false).render.should == xml
   end
 
   it "should render input array to pretty xml string" do
@@ -58,7 +58,7 @@ describe WebCrawler::View::Xml do
     "<response><title>1</title><url>2</url><author>3</author></response>\n" <<
     "<response><title>string</title><url>other string\n</url><author></author></response>\n" <<
     "</responses>"
-    described_class.new(input, headers: [:title, :url, :author], pretty: true).render.should == xml
+    described_class.new(input, headers: [:title, :url, :author]).render.should == xml
   end
 
   it "should render input array without :headers to xml string" do
@@ -90,6 +90,6 @@ describe WebCrawler::View do
     output = ""
     io = StringIO.new(output)
     WebCrawler::View.factory('json', [[1, 2, 3]]).draw(io)
-    output.should == "{\"responses\":[[1,2,3]]}\n"
+    output.should == "[[1,2,3]]\n"
   end
 end
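The thread running through these view hunks: JSON output lost its {"responses": ...} envelope, XML is now pretty-printed by default (pretty: false opts out), and CSV options moved under a csv: key. Rendering to an IO is unchanged, as the last example shows; a minimal sketch:

    require 'stringio'

    io = StringIO.new("")
    WebCrawler::View.factory('json', [[1, 2, 3]]).draw(io)
    io.string # => "[[1,2,3]]\n"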
@@ -0,0 +1,143 @@
+ require "spec_helper"
+
+ describe WebCrawler::Base do
+   require "fixtures/my_crawler"
+
+   before(:all) do
+     @uri_map = FakeWeb::Registry.instance.uri_map
+     FakeWeb.clean_registry
+
+     MyCrawler.targets.each do |url|
+       FakeWeb.register_uri(:get, url, :body => 'spec/fixtures/example.xml', :content_type => "text/html; charset=windows-1251")
+     end
+   end
+
+   after(:all) do
+     FakeWeb::Registry.instance.uri_map = @uri_map
+   end
+
+   describe ' > ', MyCrawler do
+     subject { MyCrawler.new }
+
+     it "should be instance of MyCrawler" do
+       subject.should be_a MyCrawler
+       subject.should be_a_kind_of described_class
+     end
+
+     it "should have a target urls" do
+       subject.targets.should be_a ::Set
+       subject.targets.should have(20).urls
+     end
+
+     it "should generate an urls" do
+       pattern = "www.example.com/category_:category/page:page/"
+       options = { :category => [1, 2, 3, 4], :page => 1..3 }
+       described_class.send(:generate_urls, pattern, options).should == ["www.example.com/category_1/page1/",
+                                                                         "www.example.com/category_1/page2/",
+                                                                         "www.example.com/category_1/page3/",
+                                                                         "www.example.com/category_2/page1/",
+                                                                         "www.example.com/category_2/page2/",
+                                                                         "www.example.com/category_2/page3/",
+                                                                         "www.example.com/category_3/page1/",
+                                                                         "www.example.com/category_3/page2/",
+                                                                         "www.example.com/category_3/page3/",
+                                                                         "www.example.com/category_4/page1/",
+                                                                         "www.example.com/category_4/page2/",
+                                                                         "www.example.com/category_4/page3/"]
+     end
+
+     it "logger should be attached to tmp/file.log" do
+       subject.logger.should be_a Logger
+       subject.logger.instance_variable_get(:@logdev).dev.path.should == '/tmp/file.log'
+     end
+
+     it "logger should be attached to Logger.new(STDERR)" do
+       class MyCrawler
+         log_to Logger.new(STDERR)
+       end
+       subject.logger.should be_a Logger
+       subject.logger.instance_variable_get(:@logdev).dev.should == STDERR
+     end
+
+     it "cache should be set" do
+       WebCrawler.config.cache.adapter.should be_a WebCrawler::CacheAdapter::Base
+     end
+
+     it "follow should collect urls from given url and fill targets" do
+       FakeWeb.register_uri(:get, urls_board_path, :body => follower_body)
+       FakeWeb.register_uri(:get, 'http://example.com/2323.html', :body => '')
+       FakeWeb.register_uri(:get, 'http://example.com/2323.html?rr=1', :body => '')
+       class TestCrawler < WebCrawler::Base
+         target 'http://example.com/follower' do |targets|
+           follow targets, :only => /\/\d+\.html/
+         end
+       end
+       TestCrawler.run
+       TestCrawler.targets.should == Set["http://example.com/2323.html", "http://example.com/2323.html?rr=1"]
+     end
+
+     context 'parsing' do
+
+       context 'context' do
+
+         it 'should initialize mappers' do
+           subject.mappers.should be_a Array
+           subject.mappers.should have(1).parser
+           subject.mappers.first.should be_a WebCrawler::Parsers::Mapper
+         end
+
+         context 'mapping' do
+           subject { MyCrawler.new.mappers.first.mapping }
+
+           let(:mapping_keys) { ["link",
+                                 "name",
+                                 "region",
+                                 "salary",
+                                 "description",
+                                 "contacts",
+                                 "company",
+                                 "published",
+                                 "expire",
+                                 "catalog item"] }
+
+           it { should be_a Hash }
+           it { should_not be_empty }
+           it { subject.keys.should == mapping_keys }
+         end
+
+         context 'run' do
+           it 'parse all elements and return Array' do
+             result = subject.run
+             result.should be_a Hash
+             result.keys.first.should == :jobs
+             result.values.flatten.should have(100).items
+           end
+
+           it 'parse all elements and return JSON string' do
+             result = subject.run :json
+             json = JSON.parse(result)
+
+             result.should be_a String
+             result.should =~ /\[{"source_link":/
+             json.should be_a Hash
+             json.values.flatten.should have(100).items
+           end
+
+           it 'parse all elements and return YAML string' do
+             result = subject.run :yaml
+             yaml = YAML.load(result)
+
+             result.should be_a String
+             result.should =~ /^---/
+             yaml.should be_a Hash
+             yaml.values.flatten.should have(100).items
+           end
+         end
+
+       end
+
+     end
+
+   end
+
+ end