roro_crawler 0.0.3 → 0.0.4

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: c624947b38a0356ad347fe2305b5ca5b42b26a80
- data.tar.gz: 17658d2929d346b2302448288876d48587ac88c9
+ metadata.gz: 9d5915884d8cf5013726ac53c6373dab6061f984
+ data.tar.gz: a83f6c9f6f7019b7d58a08588e63989b1fbf6ca9
  SHA512:
- metadata.gz: 016a1722e2afbf2726515d71f7f1b4b26288ee3dc6b4035c7313ea49c129f9040c240754c0d69befa54793b1e7475b7ad61fac0d875c5db3be964338e6a7a4d6
- data.tar.gz: eb792990a4f3f2935356771aef37cbadfdb066660fba4a5fbf0ee82ae3718b125bf7183cdaf37efa759cb56152c3bc59715659459146e806670610c81a5aba89
+ metadata.gz: b3b91e75d2b5cf527ec7c2bdd7d783ece4002fbf6374403dbe4eeeed551d12488ee7487b044602b585e71cf3f954a8a6702df5d5fd22fc4af08c8dbddae9b03e
+ data.tar.gz: a84a083837e112f18403a43e60b162b2430d2cb326614a2cee2578bb2d8849905dd386abc7348ff91797dfb052987f12b99a8d0a6d7c44c77c174dc196602279
data/Rakefile CHANGED
@@ -16,7 +16,6 @@ end



-
  Bundler::GemHelper.install_tasks

  require 'rake/testtask'
@@ -30,3 +29,9 @@ end


  task default: :test
+
+ require 'rspec/core/rake_task'
+
+ RSpec::Core::RakeTask.new
+
+ task default: :spec
data/lib/roro_crawler.rb CHANGED
@@ -1,5 +1,7 @@
  # require all files in dir name is same with __FILE_-
- dirname = __FILE__.split('/').last.gsub(/\.rb/, '')
- Dir[File.expand_path("../#{dirname}/*", __FILE__)].each do |file|
-   require file
- end
+ require 'roro_support'
+ require 'headless'
+
+ $LOAD_PATH.unshift File.expand_path('../roro_crawler', __FILE__)
+ require 'methods'
+ require 'base'
data/lib/roro_crawler/base.rb ADDED
@@ -0,0 +1,40 @@
+ module RoRoCrawler
+   class Base
+
+     include RoRoSupport::Crawler
+
+     def spider(url, auchor_selector, intr_selector)
+       @url = url
+       @home_url = get_home_url(url)
+       @auchor_selector = auchor_selector
+       @intr_selector = intr_selector
+       get_link_titles
+       get_intrs
+       browser_close
+     end
+
+     def get_link_titles
+       @link_titles = get_tags_attrs_from(@url, @auchor_selector, 'href', 'text')
+     end
+
+     def get_intrs
+       raise "#{@link_titles} is nil" if @link_titles.nil?
+       @link_title_intrs = @link_titles.dup
+       @link_title_intrs.each do |link_content|
+         if link_content[0][/http/]
+           link = link_content[0]
+         else
+           link = "#{@home_url}#{link_content[0]}"
+         end
+
+         link_content << intr = get_tags_attrs_from(link, @intr_selector, 'text')
+         raise "crawler find multi intrs according #{@intr_selector}, please make @intr_selector more exactly, error intr is #{intr}" unless intr.is_a? String
+         link_content
+       end
+     end
+
+     def handler(&blk)
+       handle @link_title_intrs, &blk
+     end
+   end
+ end
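For orientation, the RoRoCrawler::Base API added above is exercised by spec/roro_crawler/base_spec.rb later in this diff. The sketch below is only an illustration of how the class appears to be used, assuming the helpers mixed in from RoRoSupport::Crawler (get_home_url, get_tags_attrs_from, browser_close, handle) behave as the code above implies; the URL and CSS selectors are placeholders taken from the specs, not part of the gem itself.

# Usage sketch (illustrative, not shipped with the gem)
require 'roro_crawler'

crawler = RoRoCrawler::Base.new

# Crawl a listing page: collect [link, title] pairs matched by the anchor
# selector, then fetch the introduction text behind each link.
crawler.spider(
  'http://ruby-china.org/topics/node25', # listing page URL (from the specs)
  '.title>a',                            # anchor selector for link/title pairs
  '.entry_content'                       # selector for each linked page's body
)

# Iterate over the collected [link, title, intr] triples.
crawler.handler do |link, title, intr|
  puts "#{title} (#{link}): #{intr[0, 80]}"
end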
data/lib/roro_crawler/methods.rb ADDED
@@ -0,0 +1,18 @@
+ #require 'nokogiri'
+ #module RoRoCrawler
+ #  module Methods
+ #    include ::RoRoSupport::Crawler
+ #    def get_intr_from(html)
+ #      doc = Nokogiri::HTML.parse html
+ #      doc.css('.jobIntro, .j_i')
+ #        .gsub(/(本站提醒:如何识别虚假招聘信息?求职必看,切勿受骗上当!)|(如何写一份简单、直接、高效的求职信?)/, '')
+ #    end
+ #
+ #    def get_links_contents_from(url, selector)
+ #      as = @browser.css @link_selector
+ #      unless as.nil?
+ #        @offers = as.collect('text', 'href')
+ #      end
+ #    end
+ #  end
+ #end
data/lib/roro_crawler/version.rb CHANGED
@@ -1,3 +1,3 @@
- module Crawler
-   VERSION = "0.0.3"
+ module RoRoCrawler
+   VERSION = "0.0.4"
  end
data/spec/roro_crawler/base_spec.rb ADDED
@@ -0,0 +1,68 @@
+ require 'spec_helper'
+
+ describe ::RoRoCrawler::Base do
+   before do
+     #Headless.new.start
+   end
+   it 'spider yjs' do
+     url = 'http://s.yingjiesheng.com/result.jsp?keyword=%E5%89%8D%E7%AB%AF%E5%AE%9E%E4%B9%A0&city=0&jobtype=0&do=1&stype=0'
+     expect(
+       ::RoRoCrawler::Base.new.spider(url, 'h3.title>a', '.job, .j_i')
+     ).not_to raise_error
+   end
+
+   describe 'spider v2ex' do
+     before do
+       @v2ex_spider = Base.new
+     end
+
+     it 'case1' do
+       url = 'http://www.v2ex.com/go/jobs?p=1'
+       expect(
+         ::RoRoCrawler::Base.new.spider(url, 'span.item_title>a', '.topic_content')
+       ).not_to raise_error
+     end
+
+     it 'case2' do
+       @v2ex_spider.instance_variable_set(:@link_titles, [
+         ["/t/79990#reply63", "[北京/杭州] 阿里巴巴2014校园招聘优秀人才内部同事推荐计划"]
+       ])
+       @v2ex_spider.instance_variable_set(:@home_url, "http://www.v2ex.com")
+       @v2ex_spider.instance_variable_set(:@intr_selector, ".cell>.topic_content")
+       @v2ex_spider.get_intrs
+     end
+   end
+
+   it 'spider ruby-china' do
+     url = 'http://ruby-china.org/topics/node25'
+     expect(
+       ::RoRoCrawler::Base.new.spider(url, '.title>a', '.entry_content')
+     ).not_to raise_error
+   end
+
+   it 'spider ruby-china intr not nil' do
+     expect(
+       get_tags_attrs_from('http://ruby-china.org/topics/13700', '.body.entry-content', 'inner_html')
+     ).not_to be_empty
+   end
+
+   it 'handler' do
+     ::RoRoCrawler::Base.class_eval do
+       def handler(&blk)
+         results = [
+           [1, 2, 3],
+           [4, 5, 6],
+           [7, 8, 9]
+         ]
+
+         handle results, &blk
+       end
+     end
+
+     ::RoRoCrawler::Base.new.handler do |a, b, c|
+       expect(a.to_s).to match /^\d$/
+       expect(b.to_s).to match /^\d$/
+       expect(c.to_s).to match /^\d$/
+     end
+   end
+ end
data/spec/roro_crawler_spec.rb ADDED
@@ -0,0 +1 @@
+ require 'spec_helper'
data/spec/spec_helper.rb CHANGED
@@ -1,13 +1,17 @@
- # This file was generated by the `rspec --init` command. Conventionally, all
- # specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
- # Require this file using `require "spec_helper"` to ensure that it is only
- # loaded once.
- #
- # See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
+ $LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
  Bundler.require
- require 'active_support'
- require File.expand_path('../../lib/roro_crawler', __FILE__)
+ require 'roro_support'
+ require 'roro_crawler'
+ include ::RoRoSupport::Crawler
+ include ::RoRoCrawler
  RSpec.configure do |config|
-   config.before :all do
-   end
+   config.treat_symbols_as_metadata_keys_with_true_values = true
+   config.run_all_when_everything_filtered = true
+   config.filter_run :focus
+
+   # Run specs in random order to surface order dependencies. If you find an
+   # order dependency and want to debug it, you can fix the order by providing
+   # the seed, which is printed after each run.
+   #   --seed 1234
+   config.order = 'random'
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: roro_crawler
  version: !ruby/object:Gem::Version
- version: 0.0.3
+ version: 0.0.4
  platform: ruby
  authors:
  - roro
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2013-08-25 00:00:00.000000000 Z
+ date: 2013-09-06 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: rails
@@ -24,26 +24,81 @@ dependencies:
  - - ~>
  - !ruby/object:Gem::Version
  version: 4.0.0
+ - !ruby/object:Gem::Dependency
+ name: watir-rails
+ requirement: !ruby/object:Gem::Requirement
+ requirements:
+ - - '>='
+ - !ruby/object:Gem::Version
+ version: '0'
+ type: :runtime
+ prerelease: false
+ version_requirements: !ruby/object:Gem::Requirement
+ requirements:
+ - - '>='
+ - !ruby/object:Gem::Version
+ version: '0'
+ - !ruby/object:Gem::Dependency
+ name: nokogiri
+ requirement: !ruby/object:Gem::Requirement
+ requirements:
+ - - '>='
+ - !ruby/object:Gem::Version
+ version: '0'
+ type: :runtime
+ prerelease: false
+ version_requirements: !ruby/object:Gem::Requirement
+ requirements:
+ - - '>='
+ - !ruby/object:Gem::Version
+ version: '0'
+ - !ruby/object:Gem::Dependency
+ name: headless
+ requirement: !ruby/object:Gem::Requirement
+ requirements:
+ - - '>='
+ - !ruby/object:Gem::Version
+ version: '0'
+ type: :runtime
+ prerelease: false
+ version_requirements: !ruby/object:Gem::Requirement
+ requirements:
+ - - '>='
+ - !ruby/object:Gem::Version
+ version: '0'
+ - !ruby/object:Gem::Dependency
+ name: grit
+ requirement: !ruby/object:Gem::Requirement
+ requirements:
+ - - '>='
+ - !ruby/object:Gem::Version
+ version: '0'
+ type: :runtime
+ prerelease: false
+ version_requirements: !ruby/object:Gem::Requirement
+ requirements:
+ - - '>='
+ - !ruby/object:Gem::Version
+ version: '0'
  description: ''
  email:
- - roro@gmail.com
+ - zhuxingruo3@gmail.com
  executables: []
  extensions: []
  extra_rdoc_files: []
  files:
- - lib/roro_crawler/crawler_handler.rb
  - lib/roro_crawler/version.rb
- - lib/roro_crawler/crawler_class.rb
+ - lib/roro_crawler/base.rb
+ - lib/roro_crawler/methods.rb
  - lib/roro_crawler.rb
- - lib/tasks/crawler_tasks.rake
  - MIT-LICENSE
  - Rakefile
  - README.rdoc
  - spec/fixtures/intr.html
  - spec/fixtures/list.html
  - spec/spec_helper.rb
- - spec/crawler_handler_spec.rb
- - spec/crawler_class_spec.rb
+ - spec/roro_crawler/base_spec.rb
+ - spec/roro_crawler_spec.rb
  homepage: ''
  licenses: []
  metadata: {}
@@ -71,5 +126,5 @@ test_files:
  - spec/fixtures/intr.html
  - spec/fixtures/list.html
  - spec/spec_helper.rb
- - spec/crawler_handler_spec.rb
- - spec/crawler_class_spec.rb
+ - spec/roro_crawler/base_spec.rb
+ - spec/roro_crawler_spec.rb
data/lib/roro_crawler/crawler_class.rb DELETED
@@ -1,98 +0,0 @@
- require 'roro_support'
- require 'active_support'
- require File.expand_path('../crawler_handler', __FILE__)
-
- include Crawler
-
- module Crawler
-   class Klass
-     attr_accessor :offers, :browser, :link, :title, :intr, :page
-
-     def initialize(options={})
-       @visible = options[:visible]
-       @link_selector = 'h3.title>a'
-       @offers = Hash.new
-       @page = 0
-
-       if @visible
-         @browser = crawler visible: @visible
-       else
-         @browser = crawler
-       end
-     end
-
-     def goto_next
-       @page += 1
-       @browser.goto "http://s.yingjiesheng.com/result.jsp?keyword=web&start=#{@page*10}&period=0&sort=score&jobtype=0"
-     end
-
-     def site(keyword, page_num, options={})
-       if url.nil?
-         p <<-MSG
-           please add
-           def site
-             url = 'http://website.com'
-             super
-           end
-         MSG
-       end
-       @browser.goto url
-
-       page_num.to_i.times do
-         links
-         link_contents
-         "http://s.yingjiesheng.com/result.jsp?keyword=web&start=#{@page*10}&period=0&sort=score&jobtype=0"
-         goto_next
-       end
-
-       @browser.close
-     end
-
-
-     def close
-       @browser.close
-     end
-
-
-     def links
-       as = @browser.css @link_selector
-       unless as.nil?
-         @offers = as.
-
-           collect('text', 'href')
-       end
-     end
-
-     def link_contents
-       @offers.each do |title, link|
-         if title && link
-           @title = title
-           @link = link
-           @intr = msg link
-           safe_save
-         end
-       end
-     end
-
-     def safe_save
-       msg = <<-MSG
-         title: #{@title}
-         link: #{@link}
-       MSG
-
-       Rails.logger.info msg
-
-       p @intr
-       return if @intr.nil?
-       Offer.create(link: @link, title: @title, intr: @intr, from: 'yjs')
-     end
-
-
-     def msg(href)
-       if href[/http\:\/\/www\.yingjiesheng\.com\/job\-\w+/]
-         @browser.goto href
-         return Handler.get_intr_from(@browser.html)
-       end
-     end
-   end
- end
data/lib/roro_crawler/crawler_handler.rb DELETED
@@ -1,13 +0,0 @@
- require 'nokogiri'
- module Crawler
-   module Handler
-     class << self
-       def get_intr_from(html)
-         doc = Nokogiri::HTML.parse html
-         doc.css('.jobIntro, .j_i')
-           .text
-           .gsub(/(本站提醒:如何识别虚假招聘信息?求职必看,切勿受骗上当!)|(如何写一份简单、直接、高效的求职信?)/, '')
-       end
-     end
-   end
- end
data/lib/tasks/crawler_tasks.rake DELETED
@@ -1,4 +0,0 @@
- # desc "Explaining what the task does"
- # task :roro_crawler do
- #   # Task goes here
- # end
data/spec/crawler_class_spec.rb DELETED
@@ -1,23 +0,0 @@
- require 'spec_helper'
- require 'roro_support'
-
- describe 'Crawler' do
-   before do
-   end
-
-   after do
-     if @c.browser
-
-     end
-   end
-
-   describe "links" do
-     before do
-
-     end
-
-     it 'links can get links correctly' do
-
-     end
-   end
- end
data/spec/crawler_handler_spec.rb DELETED
@@ -1,21 +0,0 @@
- require 'spec_helper'
- include Crawler
-
- describe "Handler" do
-   describe 'get_intr_from' do
-     before :all do
-       @pass = lambda do
-         content = File.read(@fixtures[:intr])
-         intr = Handler::get_intr_from content
-         print intr
-         expect(intr.length).to be < 1000
-         expect(intr).not_to include "本站提醒:如何识别虚假招聘信息?求职必看,切勿受骗上当!"
-         expect(intr).not_to include "如何写一份简单、直接、高效的求职信?"
-       end
-
-     end
-     it 'pass spec1' do
-       @pass.call
-     end
-   end
- end