roro_crawler 0.0.3 → 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Rakefile +6 -1
- data/lib/roro_crawler.rb +6 -4
- data/lib/roro_crawler/base.rb +40 -0
- data/lib/roro_crawler/methods.rb +18 -0
- data/lib/roro_crawler/version.rb +2 -2
- data/spec/roro_crawler/base_spec.rb +68 -0
- data/spec/roro_crawler_spec.rb +1 -0
- data/spec/spec_helper.rb +14 -10
- metadata +65 -10
- data/lib/roro_crawler/crawler_class.rb +0 -98
- data/lib/roro_crawler/crawler_handler.rb +0 -13
- data/lib/tasks/crawler_tasks.rake +0 -4
- data/spec/crawler_class_spec.rb +0 -23
- data/spec/crawler_handler_spec.rb +0 -21
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9d5915884d8cf5013726ac53c6373dab6061f984
|
4
|
+
data.tar.gz: a83f6c9f6f7019b7d58a08588e63989b1fbf6ca9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b3b91e75d2b5cf527ec7c2bdd7d783ece4002fbf6374403dbe4eeeed551d12488ee7487b044602b585e71cf3f954a8a6702df5d5fd22fc4af08c8dbddae9b03e
|
7
|
+
data.tar.gz: a84a083837e112f18403a43e60b162b2430d2cb326614a2cee2578bb2d8849905dd386abc7348ff91797dfb052987f12b99a8d0a6d7c44c77c174dc196602279
|
data/Rakefile
CHANGED
data/lib/roro_crawler.rb
CHANGED
@@ -1,5 +1,7 @@
|
|
1
1
|
# require all files in the directory whose name matches __FILE__
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
2
|
+
require 'roro_support'
|
3
|
+
require 'headless'
|
4
|
+
|
5
|
+
$LOAD_PATH.unshift File.expand_path('../roro_crawler', __FILE__)
|
6
|
+
require 'methods'
|
7
|
+
require 'base'
|
@@ -0,0 +1,40 @@
|
|
1
|
+
module RoRoCrawler
|
2
|
+
class Base
|
3
|
+
|
4
|
+
include RoRoSupport::Crawler
|
5
|
+
|
6
|
+
def spider(url, auchor_selector, intr_selector)
|
7
|
+
@url = url
|
8
|
+
@home_url = get_home_url(url)
|
9
|
+
@auchor_selector = auchor_selector
|
10
|
+
@intr_selector = intr_selector
|
11
|
+
get_link_titles
|
12
|
+
get_intrs
|
13
|
+
browser_close
|
14
|
+
end
|
15
|
+
|
16
|
+
def get_link_titles
|
17
|
+
@link_titles = get_tags_attrs_from(@url, @auchor_selector, 'href', 'text')
|
18
|
+
end
|
19
|
+
|
20
|
+
def get_intrs
|
21
|
+
raise "#{@link_titles} is nil" if @link_titles.nil?
|
22
|
+
@link_title_intrs = @link_titles.dup
|
23
|
+
@link_title_intrs.each do |link_content|
|
24
|
+
if link_content[0][/http/]
|
25
|
+
link = link_content[0]
|
26
|
+
else
|
27
|
+
link = "#{@home_url}#{link_content[0]}"
|
28
|
+
end
|
29
|
+
|
30
|
+
link_content << intr = get_tags_attrs_from(link, @intr_selector, 'text')
|
31
|
+
raise "crawler find multi intrs according #{@intr_selector}, please make @intr_selector more exactly, error intr is #{intr}" unless intr.is_a? String
|
32
|
+
link_content
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
def handler(&blk)
|
37
|
+
handle @link_title_intrs, &blk
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
#require 'nokogiri'
|
2
|
+
#module RoRoCrawler
|
3
|
+
# module Methods
|
4
|
+
# include ::RoRoSupport::Crawler
|
5
|
+
# def get_intr_from(html)
|
6
|
+
# doc = Nokogiri::HTML.parse html
|
7
|
+
# doc.css('.jobIntro, .j_i')
|
8
|
+
# .gsub(/(本站提醒:如何识别虚假招聘信息?求职必看,切勿受骗上当!)|(如何写一份简单、直接、高效的求职信?)/, '')
|
9
|
+
# end
|
10
|
+
#
|
11
|
+
# def get_links_contents_from(url, selector)
|
12
|
+
# as = @browser.css @link_selector
|
13
|
+
# unless as.nil?
|
14
|
+
# @offers = as.collect('text', 'href')
|
15
|
+
# end
|
16
|
+
# end
|
17
|
+
# end
|
18
|
+
#end
|
data/lib/roro_crawler/version.rb
CHANGED
@@ -1,3 +1,3 @@
|
|
1
|
-
module
|
2
|
-
VERSION = "0.0.3"
|
1
|
+
module RoRoCrawler
|
2
|
+
VERSION = "0.0.4"
|
3
3
|
end
|
@@ -0,0 +1,68 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe ::RoRoCrawler::Base do
|
4
|
+
before do
|
5
|
+
#Headless.new.start
|
6
|
+
end
|
7
|
+
it 'spider yjs' do
|
8
|
+
url = 'http://s.yingjiesheng.com/result.jsp?keyword=%E5%89%8D%E7%AB%AF%E5%AE%9E%E4%B9%A0&city=0&jobtype=0&do=1&stype=0'
|
9
|
+
expect(
|
10
|
+
::RoRoCrawler::Base.new.spider(url, 'h3.title>a', '.job, .j_i')
|
11
|
+
).not_to raise_error
|
12
|
+
end
|
13
|
+
|
14
|
+
describe 'spider v2ex' do
|
15
|
+
before do
|
16
|
+
@v2ex_spider = Base.new
|
17
|
+
end
|
18
|
+
|
19
|
+
it 'case1' do
|
20
|
+
url = 'http://www.v2ex.com/go/jobs?p=1'
|
21
|
+
expect(
|
22
|
+
::RoRoCrawler::Base.new.spider(url, 'span.item_title>a', '.topic_content')
|
23
|
+
).not_to raise_error
|
24
|
+
end
|
25
|
+
|
26
|
+
it 'case2' do
|
27
|
+
@v2ex_spider.instance_variable_set(:@link_titles, [
|
28
|
+
["/t/79990#reply63", "[北京/杭州] 阿里巴巴2014校园招聘优秀人才内部同事推荐计划"]
|
29
|
+
])
|
30
|
+
@v2ex_spider.instance_variable_set(:@home_url, "http://www.v2ex.com")
|
31
|
+
@v2ex_spider.instance_variable_set(:@intr_selector, ".cell>.topic_content")
|
32
|
+
@v2ex_spider.get_intrs
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
it 'spider ruby-china' do
|
37
|
+
url = 'http://ruby-china.org/topics/node25'
|
38
|
+
expect(
|
39
|
+
::RoRoCrawler::Base.new.spider(url, '.title>a', '.entry_content')
|
40
|
+
).not_to raise_error
|
41
|
+
end
|
42
|
+
|
43
|
+
it 'spider ruby-china intr not nil' do
|
44
|
+
expect(
|
45
|
+
get_tags_attrs_from('http://ruby-china.org/topics/13700', '.body.entry-content', 'inner_html')
|
46
|
+
).not_to be_empty
|
47
|
+
end
|
48
|
+
|
49
|
+
it 'handler' do
|
50
|
+
::RoRoCrawler::Base.class_eval do
|
51
|
+
def handler(&blk)
|
52
|
+
results = [
|
53
|
+
[1, 2, 3],
|
54
|
+
[4, 5, 6],
|
55
|
+
[7, 8, 9]
|
56
|
+
]
|
57
|
+
|
58
|
+
handle results, &blk
|
59
|
+
end
|
60
|
+
end
|
61
|
+
|
62
|
+
::RoRoCrawler::Base.new.handler do |a, b, c|
|
63
|
+
expect(a.to_s).to match /^\d$/
|
64
|
+
expect(b.to_s).to match /^\d$/
|
65
|
+
expect(c.to_s).to match /^\d$/
|
66
|
+
end
|
67
|
+
end
|
68
|
+
end
|
@@ -0,0 +1 @@
|
|
1
|
+
require 'spec_helper'
|
data/spec/spec_helper.rb
CHANGED
@@ -1,13 +1,17 @@
|
|
1
|
-
|
2
|
-
# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
|
3
|
-
# Require this file using `require "spec_helper"` to ensure that it is only
|
4
|
-
# loaded once.
|
5
|
-
#
|
6
|
-
# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
|
1
|
+
$LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
|
7
2
|
Bundler.require
|
8
|
-
require '
|
9
|
-
require
|
3
|
+
require 'roro_support'
|
4
|
+
require 'roro_crawler'
|
5
|
+
include ::RoRoSupport::Crawler
|
6
|
+
include ::RoRoCrawler
|
10
7
|
RSpec.configure do |config|
|
11
|
-
config.
|
12
|
-
|
8
|
+
config.treat_symbols_as_metadata_keys_with_true_values = true
|
9
|
+
config.run_all_when_everything_filtered = true
|
10
|
+
config.filter_run :focus
|
11
|
+
|
12
|
+
# Run specs in random order to surface order dependencies. If you find an
|
13
|
+
# order dependency and want to debug it, you can fix the order by providing
|
14
|
+
# the seed, which is printed after each run.
|
15
|
+
# --seed 1234
|
16
|
+
config.order = 'random'
|
13
17
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: roro_crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.3
|
4
|
+
version: 0.0.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- roro
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-
|
11
|
+
date: 2013-09-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rails
|
@@ -24,26 +24,81 @@ dependencies:
|
|
24
24
|
- - ~>
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: 4.0.0
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: watir-rails
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - '>='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - '>='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: nokogiri
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - '>='
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: headless
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - '>='
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: grit
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - '>='
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - '>='
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
27
83
|
description: ''
|
28
84
|
email:
|
29
|
-
-
|
85
|
+
- zhuxingruo3@gmail.com
|
30
86
|
executables: []
|
31
87
|
extensions: []
|
32
88
|
extra_rdoc_files: []
|
33
89
|
files:
|
34
|
-
- lib/roro_crawler/crawler_handler.rb
|
35
90
|
- lib/roro_crawler/version.rb
|
36
|
-
- lib/roro_crawler/
|
91
|
+
- lib/roro_crawler/base.rb
|
92
|
+
- lib/roro_crawler/methods.rb
|
37
93
|
- lib/roro_crawler.rb
|
38
|
-
- lib/tasks/crawler_tasks.rake
|
39
94
|
- MIT-LICENSE
|
40
95
|
- Rakefile
|
41
96
|
- README.rdoc
|
42
97
|
- spec/fixtures/intr.html
|
43
98
|
- spec/fixtures/list.html
|
44
99
|
- spec/spec_helper.rb
|
45
|
-
- spec/
|
46
|
-
- spec/
|
100
|
+
- spec/roro_crawler/base_spec.rb
|
101
|
+
- spec/roro_crawler_spec.rb
|
47
102
|
homepage: ''
|
48
103
|
licenses: []
|
49
104
|
metadata: {}
|
@@ -71,5 +126,5 @@ test_files:
|
|
71
126
|
- spec/fixtures/intr.html
|
72
127
|
- spec/fixtures/list.html
|
73
128
|
- spec/spec_helper.rb
|
74
|
-
- spec/
|
75
|
-
- spec/
|
129
|
+
- spec/roro_crawler/base_spec.rb
|
130
|
+
- spec/roro_crawler_spec.rb
|
@@ -1,98 +0,0 @@
|
|
1
|
-
require 'roro_support'
|
2
|
-
require 'active_support'
|
3
|
-
require File.expand_path('../crawler_handler', __FILE__)
|
4
|
-
|
5
|
-
include Crawler
|
6
|
-
|
7
|
-
module Crawler
|
8
|
-
class Klass
|
9
|
-
attr_accessor :offers, :browser, :link, :title, :intr, :page
|
10
|
-
|
11
|
-
def initialize(options={})
|
12
|
-
@visible = options[:visible]
|
13
|
-
@link_selector = 'h3.title>a'
|
14
|
-
@offers = Hash.new
|
15
|
-
@page = 0
|
16
|
-
|
17
|
-
if @visible
|
18
|
-
@browser = crawler visible: @visible
|
19
|
-
else
|
20
|
-
@browser = crawler
|
21
|
-
end
|
22
|
-
end
|
23
|
-
|
24
|
-
def goto_next
|
25
|
-
@page += 1
|
26
|
-
@browser.goto "http://s.yingjiesheng.com/result.jsp?keyword=web&start=#{@page*10}&period=0&sort=score&jobtype=0"
|
27
|
-
end
|
28
|
-
|
29
|
-
def site(keyword, page_num, options={})
|
30
|
-
if url.nil?
|
31
|
-
p <<-MSG
|
32
|
-
please add
|
33
|
-
def site
|
34
|
-
url = 'http://website.com'
|
35
|
-
super
|
36
|
-
end
|
37
|
-
MSG
|
38
|
-
end
|
39
|
-
@browser.goto url
|
40
|
-
|
41
|
-
page_num.to_i.times do
|
42
|
-
links
|
43
|
-
link_contents
|
44
|
-
"http://s.yingjiesheng.com/result.jsp?keyword=web&start=#{@page*10}&period=0&sort=score&jobtype=0"
|
45
|
-
goto_next
|
46
|
-
end
|
47
|
-
|
48
|
-
@browser.close
|
49
|
-
end
|
50
|
-
|
51
|
-
|
52
|
-
def close
|
53
|
-
@browser.close
|
54
|
-
end
|
55
|
-
|
56
|
-
|
57
|
-
def links
|
58
|
-
as = @browser.css @link_selector
|
59
|
-
unless as.nil?
|
60
|
-
@offers = as.
|
61
|
-
|
62
|
-
collect('text', 'href')
|
63
|
-
end
|
64
|
-
end
|
65
|
-
|
66
|
-
def link_contents
|
67
|
-
@offers.each do |title, link|
|
68
|
-
if title && link
|
69
|
-
@title = title
|
70
|
-
@link = link
|
71
|
-
@intr = msg link
|
72
|
-
safe_save
|
73
|
-
end
|
74
|
-
end
|
75
|
-
end
|
76
|
-
|
77
|
-
def safe_save
|
78
|
-
msg = <<-MSG
|
79
|
-
title: #{@title}
|
80
|
-
link: #{@link}
|
81
|
-
MSG
|
82
|
-
|
83
|
-
Rails.logger.info msg
|
84
|
-
|
85
|
-
p @intr
|
86
|
-
return if @intr.nil?
|
87
|
-
Offer.create(link: @link, title: @title, intr: @intr, from: 'yjs')
|
88
|
-
end
|
89
|
-
|
90
|
-
|
91
|
-
def msg(href)
|
92
|
-
if href[/http\:\/\/www\.yingjiesheng\.com\/job\-\w+/]
|
93
|
-
@browser.goto href
|
94
|
-
return Handler.get_intr_from(@browser.html)
|
95
|
-
end
|
96
|
-
end
|
97
|
-
end
|
98
|
-
end
|
@@ -1,13 +0,0 @@
|
|
1
|
-
require 'nokogiri'
|
2
|
-
module Crawler
|
3
|
-
module Handler
|
4
|
-
class << self
|
5
|
-
def get_intr_from(html)
|
6
|
-
doc = Nokogiri::HTML.parse html
|
7
|
-
doc.css('.jobIntro, .j_i')
|
8
|
-
.text
|
9
|
-
.gsub(/(本站提醒:如何识别虚假招聘信息?求职必看,切勿受骗上当!)|(如何写一份简单、直接、高效的求职信?)/, '')
|
10
|
-
end
|
11
|
-
end
|
12
|
-
end
|
13
|
-
end
|
data/spec/crawler_class_spec.rb
DELETED
@@ -1,23 +0,0 @@
|
|
1
|
-
require 'spec_helper'
|
2
|
-
require 'roro_support'
|
3
|
-
|
4
|
-
describe 'Crawler' do
|
5
|
-
before do
|
6
|
-
end
|
7
|
-
|
8
|
-
after do
|
9
|
-
if @c.browser
|
10
|
-
|
11
|
-
end
|
12
|
-
end
|
13
|
-
|
14
|
-
describe "links" do
|
15
|
-
before do
|
16
|
-
|
17
|
-
end
|
18
|
-
|
19
|
-
it 'links can get links correctly' do
|
20
|
-
|
21
|
-
end
|
22
|
-
end
|
23
|
-
end
|
@@ -1,21 +0,0 @@
|
|
1
|
-
require 'spec_helper'
|
2
|
-
include Crawler
|
3
|
-
|
4
|
-
describe "Handler" do
|
5
|
-
describe 'get_intr_from' do
|
6
|
-
before :all do
|
7
|
-
@pass = lambda do
|
8
|
-
content = File.read(@fixtures[:intr])
|
9
|
-
intr = Handler::get_intr_from content
|
10
|
-
print intr
|
11
|
-
expect(intr.length).to be < 1000
|
12
|
-
expect(intr).not_to include "本站提醒:如何识别虚假招聘信息?求职必看,切勿受骗上当!"
|
13
|
-
expect(intr).not_to include "如何写一份简单、直接、高效的求职信?"
|
14
|
-
end
|
15
|
-
|
16
|
-
end
|
17
|
-
it 'pass spec1' do
|
18
|
-
@pass.call
|
19
|
-
end
|
20
|
-
end
|
21
|
-
end
|