roro_crawler 0.0.3 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Rakefile +6 -1
- data/lib/roro_crawler.rb +6 -4
- data/lib/roro_crawler/base.rb +40 -0
- data/lib/roro_crawler/methods.rb +18 -0
- data/lib/roro_crawler/version.rb +2 -2
- data/spec/roro_crawler/base_spec.rb +68 -0
- data/spec/roro_crawler_spec.rb +1 -0
- data/spec/spec_helper.rb +14 -10
- metadata +65 -10
- data/lib/roro_crawler/crawler_class.rb +0 -98
- data/lib/roro_crawler/crawler_handler.rb +0 -13
- data/lib/tasks/crawler_tasks.rake +0 -4
- data/spec/crawler_class_spec.rb +0 -23
- data/spec/crawler_handler_spec.rb +0 -21
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9d5915884d8cf5013726ac53c6373dab6061f984
|
4
|
+
data.tar.gz: a83f6c9f6f7019b7d58a08588e63989b1fbf6ca9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b3b91e75d2b5cf527ec7c2bdd7d783ece4002fbf6374403dbe4eeeed551d12488ee7487b044602b585e71cf3f954a8a6702df5d5fd22fc4af08c8dbddae9b03e
|
7
|
+
data.tar.gz: a84a083837e112f18403a43e60b162b2430d2cb326614a2cee2578bb2d8849905dd386abc7348ff91797dfb052987f12b99a8d0a6d7c44c77c174dc196602279
|
data/Rakefile
CHANGED
data/lib/roro_crawler.rb
CHANGED
@@ -1,5 +1,7 @@
|
|
1
1
|
# require all files in dir name is same with __FILE_-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
2
|
+
require 'roro_support'
|
3
|
+
require 'headless'
|
4
|
+
|
5
|
+
$LOAD_PATH.unshift File.expand_path('../roro_crawler', __FILE__)
|
6
|
+
require 'methods'
|
7
|
+
require 'base'
|
@@ -0,0 +1,40 @@
|
|
1
|
+
module RoRoCrawler
  # Crawls a listing page: collects the href and text of every link matched by
  # a selector, then visits each link and extracts one introduction text.
  #
  # get_home_url, get_tags_attrs_from, browser_close and handle are not defined
  # in this class; presumably they come from the included RoRoSupport::Crawler
  # module (verify against roro_support).
  class Base

    include RoRoSupport::Crawler

    # Runs a full crawl of +url+.
    #
    # url             - address of the listing page.
    # auchor_selector - CSS selector for the anchors on the listing page.
    # intr_selector   - CSS selector for the introduction on each linked page.
    #
    # Returns whatever browser_close returns.
    def spider(url, auchor_selector, intr_selector)
      @url = url
      @home_url = get_home_url(url)
      @auchor_selector = auchor_selector
      @intr_selector = intr_selector
      get_link_titles
      get_intrs
      # NOTE(review): browser_close is only reached when both steps above
      # succeed; confirm whether the browser must also be closed on failure.
      browser_close
    end

    # Stores the 'href' and 'text' of every anchor matched by @auchor_selector
    # on @url in @link_titles. The code below reads each entry as an array
    # whose first element is the href.
    def get_link_titles
      @link_titles = get_tags_attrs_from(@url, @auchor_selector, 'href', 'text')
    end

    # Builds @link_title_intrs: one copy of every @link_titles entry with the
    # introduction text of the linked page appended.
    #
    # Raises RuntimeError when @link_titles has not been populated, or when the
    # lookup for an introduction does not yield a single String.
    def get_intrs
      raise '@link_titles is nil, call get_link_titles first' if @link_titles.nil?

      @link_title_intrs = @link_titles.map do |link_title|
        # Copy each row so appending the intro leaves @link_titles untouched
        # (a plain Array#dup of the outer array would share the inner rows).
        entry = link_title.dup
        intr = get_tags_attrs_from(full_link_for(entry[0]), @intr_selector, 'text')
        # Validate before appending so a bad result is never stored.
        raise "crawler find multi intrs according #{@intr_selector}, please make @intr_selector more exactly, error intr is #{intr}" unless intr.is_a? String
        entry << intr
      end
    end

    # Passes the collected rows and the given block to handle.
    def handler(&blk)
      handle @link_title_intrs, &blk
    end

    private

    # Returns +href+ unchanged when it starts with an http(s) scheme, otherwise
    # prefixes it with @home_url. The match is anchored so a relative path that
    # merely contains "http" is still treated as relative.
    def full_link_for(href)
      href[%r{\Ahttps?://}i] ? href : "#{@home_url}#{href}"
    end
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
#require 'nokogiri'
|
2
|
+
#module RoRoCrawler
|
3
|
+
# module Methods
|
4
|
+
# include ::RoRoSupport::Crawler
|
5
|
+
# def get_intr_from(html)
|
6
|
+
# doc = Nokogiri::HTML.parse html
|
7
|
+
# doc.css('.jobIntro, .j_i')
|
8
|
+
# .gsub(/(本站提醒:如何识别虚假招聘信息?求职必看,切勿受骗上当!)|(如何写一份简单、直接、高效的求职信?)/, '')
|
9
|
+
# end
|
10
|
+
#
|
11
|
+
# def get_links_contents_from(url, selector)
|
12
|
+
# as = @browser.css @link_selector
|
13
|
+
# unless as.nil?
|
14
|
+
# @offers = as.collect('text', 'href')
|
15
|
+
# end
|
16
|
+
# end
|
17
|
+
# end
|
18
|
+
#end
|
data/lib/roro_crawler/version.rb
CHANGED
@@ -1,3 +1,3 @@
|
|
1
|
-
module
|
2
|
-
VERSION = "0.0.
|
1
|
+
module RoRoCrawler
|
2
|
+
VERSION = "0.0.4"
|
3
3
|
end
|
@@ -0,0 +1,68 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Specs for RoRoCrawler::Base.
# NOTE(review): the spider examples point at real public URLs, so they
# presumably need network access and a working browser — confirm before
# running them in CI.
describe ::RoRoCrawler::Base do
  before do
    #Headless.new.start
  end

  # raise_error is a block matcher: the call under test must be wrapped in
  # expect { ... }, otherwise it runs (and may raise) before the expectation
  # is even built.
  it 'spider yjs' do
    url = 'http://s.yingjiesheng.com/result.jsp?keyword=%E5%89%8D%E7%AB%AF%E5%AE%9E%E4%B9%A0&city=0&jobtype=0&do=1&stype=0'
    expect {
      ::RoRoCrawler::Base.new.spider(url, 'h3.title>a', '.job, .j_i')
    }.not_to raise_error
  end

  describe 'spider v2ex' do
    before do
      # Fully qualified so the example does not depend on RoRoCrawler being
      # mixed into the top-level scope.
      @v2ex_spider = ::RoRoCrawler::Base.new
    end

    it 'case1' do
      url = 'http://www.v2ex.com/go/jobs?p=1'
      expect {
        ::RoRoCrawler::Base.new.spider(url, 'span.item_title>a', '.topic_content')
      }.not_to raise_error
    end

    # Seeds the crawler state by hand and checks that get_intrs completes for
    # a relative href.
    it 'case2' do
      @v2ex_spider.instance_variable_set(:@link_titles, [
        ["/t/79990#reply63", "[北京/杭州] 阿里巴巴2014校园招聘优秀人才内部同事推荐计划"]
      ])
      @v2ex_spider.instance_variable_set(:@home_url, "http://www.v2ex.com")
      @v2ex_spider.instance_variable_set(:@intr_selector, ".cell>.topic_content")
      @v2ex_spider.get_intrs
    end
  end

  it 'spider ruby-china' do
    url = 'http://ruby-china.org/topics/node25'
    expect {
      ::RoRoCrawler::Base.new.spider(url, '.title>a', '.entry_content')
    }.not_to raise_error
  end

  it 'spider ruby-china intr not nil' do
    expect(
      get_tags_attrs_from('http://ruby-china.org/topics/13700', '.body.entry-content', 'inner_html')
    ).not_to be_empty
  end

  it 'handler' do
    # Override handler on an anonymous subclass instead of reopening Base, so
    # the stub cannot leak into other examples (specs run in random order).
    stubbed_crawler = Class.new(::RoRoCrawler::Base) do
      def handler(&blk)
        results = [
          [1, 2, 3],
          [4, 5, 6],
          [7, 8, 9]
        ]

        handle results, &blk
      end
    end

    stubbed_crawler.new.handler do |a, b, c|
      expect(a.to_s).to match(/^\d$/)
      expect(b.to_s).to match(/^\d$/)
      expect(c.to_s).to match(/^\d$/)
    end
  end
end
|
@@ -0,0 +1 @@
|
|
1
|
+
require 'spec_helper'
|
data/spec/spec_helper.rb
CHANGED
@@ -1,13 +1,17 @@
|
|
1
|
-
|
2
|
-
# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
|
3
|
-
# Require this file using `require "spec_helper"` to ensure that it is only
|
4
|
-
# loaded once.
|
5
|
-
#
|
6
|
-
# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
|
1
|
+
$LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
|
7
2
|
Bundler.require
|
8
|
-
require '
|
9
|
-
require
|
3
|
+
require 'roro_support'
|
4
|
+
require 'roro_crawler'
|
5
|
+
include ::RoRoSupport::Crawler
|
6
|
+
include ::RoRoCrawler
|
10
7
|
RSpec.configure do |config|
|
11
|
-
config.
|
12
|
-
|
8
|
+
config.treat_symbols_as_metadata_keys_with_true_values = true
|
9
|
+
config.run_all_when_everything_filtered = true
|
10
|
+
config.filter_run :focus
|
11
|
+
|
12
|
+
# Run specs in random order to surface order dependencies. If you find an
|
13
|
+
# order dependency and want to debug it, you can fix the order by providing
|
14
|
+
# the seed, which is printed after each run.
|
15
|
+
# --seed 1234
|
16
|
+
config.order = 'random'
|
13
17
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: roro_crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- roro
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-
|
11
|
+
date: 2013-09-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rails
|
@@ -24,26 +24,81 @@ dependencies:
|
|
24
24
|
- - ~>
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: 4.0.0
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: watir-rails
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - '>='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - '>='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: nokogiri
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - '>='
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: headless
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - '>='
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: grit
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - '>='
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - '>='
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
27
83
|
description: ''
|
28
84
|
email:
|
29
|
-
-
|
85
|
+
- zhuxingruo3@gmail.com
|
30
86
|
executables: []
|
31
87
|
extensions: []
|
32
88
|
extra_rdoc_files: []
|
33
89
|
files:
|
34
|
-
- lib/roro_crawler/crawler_handler.rb
|
35
90
|
- lib/roro_crawler/version.rb
|
36
|
-
- lib/roro_crawler/
|
91
|
+
- lib/roro_crawler/base.rb
|
92
|
+
- lib/roro_crawler/methods.rb
|
37
93
|
- lib/roro_crawler.rb
|
38
|
-
- lib/tasks/crawler_tasks.rake
|
39
94
|
- MIT-LICENSE
|
40
95
|
- Rakefile
|
41
96
|
- README.rdoc
|
42
97
|
- spec/fixtures/intr.html
|
43
98
|
- spec/fixtures/list.html
|
44
99
|
- spec/spec_helper.rb
|
45
|
-
- spec/
|
46
|
-
- spec/
|
100
|
+
- spec/roro_crawler/base_spec.rb
|
101
|
+
- spec/roro_crawler_spec.rb
|
47
102
|
homepage: ''
|
48
103
|
licenses: []
|
49
104
|
metadata: {}
|
@@ -71,5 +126,5 @@ test_files:
|
|
71
126
|
- spec/fixtures/intr.html
|
72
127
|
- spec/fixtures/list.html
|
73
128
|
- spec/spec_helper.rb
|
74
|
-
- spec/
|
75
|
-
- spec/
|
129
|
+
- spec/roro_crawler/base_spec.rb
|
130
|
+
- spec/roro_crawler_spec.rb
|
@@ -1,98 +0,0 @@
|
|
1
|
-
require 'roro_support'
|
2
|
-
require 'active_support'
|
3
|
-
require File.expand_path('../crawler_handler', __FILE__)
|
4
|
-
|
5
|
-
include Crawler
|
6
|
-
|
7
|
-
module Crawler
|
8
|
-
class Klass
|
9
|
-
attr_accessor :offers, :browser, :link, :title, :intr, :page
|
10
|
-
|
11
|
-
def initialize(options={})
|
12
|
-
@visible = options[:visible]
|
13
|
-
@link_selector = 'h3.title>a'
|
14
|
-
@offers = Hash.new
|
15
|
-
@page = 0
|
16
|
-
|
17
|
-
if @visible
|
18
|
-
@browser = crawler visible: @visible
|
19
|
-
else
|
20
|
-
@browser = crawler
|
21
|
-
end
|
22
|
-
end
|
23
|
-
|
24
|
-
def goto_next
|
25
|
-
@page += 1
|
26
|
-
@browser.goto "http://s.yingjiesheng.com/result.jsp?keyword=web&start=#{@page*10}&period=0&sort=score&jobtype=0"
|
27
|
-
end
|
28
|
-
|
29
|
-
def site(keyword, page_num, options={})
|
30
|
-
if url.nil?
|
31
|
-
p <<-MSG
|
32
|
-
please add
|
33
|
-
def site
|
34
|
-
url = 'http://website.com'
|
35
|
-
super
|
36
|
-
end
|
37
|
-
MSG
|
38
|
-
end
|
39
|
-
@browser.goto url
|
40
|
-
|
41
|
-
page_num.to_i.times do
|
42
|
-
links
|
43
|
-
link_contents
|
44
|
-
"http://s.yingjiesheng.com/result.jsp?keyword=web&start=#{@page*10}&period=0&sort=score&jobtype=0"
|
45
|
-
goto_next
|
46
|
-
end
|
47
|
-
|
48
|
-
@browser.close
|
49
|
-
end
|
50
|
-
|
51
|
-
|
52
|
-
def close
|
53
|
-
@browser.close
|
54
|
-
end
|
55
|
-
|
56
|
-
|
57
|
-
def links
|
58
|
-
as = @browser.css @link_selector
|
59
|
-
unless as.nil?
|
60
|
-
@offers = as.
|
61
|
-
|
62
|
-
collect('text', 'href')
|
63
|
-
end
|
64
|
-
end
|
65
|
-
|
66
|
-
def link_contents
|
67
|
-
@offers.each do |title, link|
|
68
|
-
if title && link
|
69
|
-
@title = title
|
70
|
-
@link = link
|
71
|
-
@intr = msg link
|
72
|
-
safe_save
|
73
|
-
end
|
74
|
-
end
|
75
|
-
end
|
76
|
-
|
77
|
-
def safe_save
|
78
|
-
msg = <<-MSG
|
79
|
-
title: #{@title}
|
80
|
-
link: #{@link}
|
81
|
-
MSG
|
82
|
-
|
83
|
-
Rails.logger.info msg
|
84
|
-
|
85
|
-
p @intr
|
86
|
-
return if @intr.nil?
|
87
|
-
Offer.create(link: @link, title: @title, intr: @intr, from: 'yjs')
|
88
|
-
end
|
89
|
-
|
90
|
-
|
91
|
-
def msg(href)
|
92
|
-
if href[/http\:\/\/www\.yingjiesheng\.com\/job\-\w+/]
|
93
|
-
@browser.goto href
|
94
|
-
return Handler.get_intr_from(@browser.html)
|
95
|
-
end
|
96
|
-
end
|
97
|
-
end
|
98
|
-
end
|
@@ -1,13 +0,0 @@
|
|
1
|
-
require 'nokogiri'
|
2
|
-
module Crawler
|
3
|
-
module Handler
|
4
|
-
class << self
|
5
|
-
def get_intr_from(html)
|
6
|
-
doc = Nokogiri::HTML.parse html
|
7
|
-
doc.css('.jobIntro, .j_i')
|
8
|
-
.text
|
9
|
-
.gsub(/(本站提醒:如何识别虚假招聘信息?求职必看,切勿受骗上当!)|(如何写一份简单、直接、高效的求职信?)/, '')
|
10
|
-
end
|
11
|
-
end
|
12
|
-
end
|
13
|
-
end
|
data/spec/crawler_class_spec.rb
DELETED
@@ -1,23 +0,0 @@
|
|
1
|
-
require 'spec_helper'
|
2
|
-
require 'roro_support'
|
3
|
-
|
4
|
-
describe 'Crawler' do
|
5
|
-
before do
|
6
|
-
end
|
7
|
-
|
8
|
-
after do
|
9
|
-
if @c.browser
|
10
|
-
|
11
|
-
end
|
12
|
-
end
|
13
|
-
|
14
|
-
describe "links" do
|
15
|
-
before do
|
16
|
-
|
17
|
-
end
|
18
|
-
|
19
|
-
it 'links can get links correctly' do
|
20
|
-
|
21
|
-
end
|
22
|
-
end
|
23
|
-
end
|
@@ -1,21 +0,0 @@
|
|
1
|
-
require 'spec_helper'
|
2
|
-
include Crawler
|
3
|
-
|
4
|
-
describe "Handler" do
|
5
|
-
describe 'get_intr_from' do
|
6
|
-
before :all do
|
7
|
-
@pass = lambda do
|
8
|
-
content = File.read(@fixtures[:intr])
|
9
|
-
intr = Handler::get_intr_from content
|
10
|
-
print intr
|
11
|
-
expect(intr.length).to be < 1000
|
12
|
-
expect(intr).not_to include "本站提醒:如何识别虚假招聘信息?求职必看,切勿受骗上当!"
|
13
|
-
expect(intr).not_to include "如何写一份简单、直接、高效的求职信?"
|
14
|
-
end
|
15
|
-
|
16
|
-
end
|
17
|
-
it 'pass spec1' do
|
18
|
-
@pass.call
|
19
|
-
end
|
20
|
-
end
|
21
|
-
end
|