roro_crawler 0.0.3 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: c624947b38a0356ad347fe2305b5ca5b42b26a80
4
- data.tar.gz: 17658d2929d346b2302448288876d48587ac88c9
3
+ metadata.gz: 9d5915884d8cf5013726ac53c6373dab6061f984
4
+ data.tar.gz: a83f6c9f6f7019b7d58a08588e63989b1fbf6ca9
5
5
  SHA512:
6
- metadata.gz: 016a1722e2afbf2726515d71f7f1b4b26288ee3dc6b4035c7313ea49c129f9040c240754c0d69befa54793b1e7475b7ad61fac0d875c5db3be964338e6a7a4d6
7
- data.tar.gz: eb792990a4f3f2935356771aef37cbadfdb066660fba4a5fbf0ee82ae3718b125bf7183cdaf37efa759cb56152c3bc59715659459146e806670610c81a5aba89
6
+ metadata.gz: b3b91e75d2b5cf527ec7c2bdd7d783ece4002fbf6374403dbe4eeeed551d12488ee7487b044602b585e71cf3f954a8a6702df5d5fd22fc4af08c8dbddae9b03e
7
+ data.tar.gz: a84a083837e112f18403a43e60b162b2430d2cb326614a2cee2578bb2d8849905dd386abc7348ff91797dfb052987f12b99a8d0a6d7c44c77c174dc196602279
data/Rakefile CHANGED
@@ -16,7 +16,6 @@ end
16
16
 
17
17
 
18
18
 
19
-
20
19
  Bundler::GemHelper.install_tasks
21
20
 
22
21
  require 'rake/testtask'
@@ -30,3 +29,9 @@ end
30
29
 
31
30
 
32
31
  task default: :test
32
+
33
+ require 'rspec/core/rake_task'
34
+
35
+ RSpec::Core::RakeTask.new
36
+
37
+ task default: :spec
@@ -1,5 +1,7 @@
1
1
  # require all files in dir name is same with __FILE_-
2
- dirname = __FILE__.split('/').last.gsub(/\.rb/, '')
3
- Dir[File.expand_path("../#{dirname}/*", __FILE__)].each do |file|
4
- require file
5
- end
2
+ require 'roro_support'
3
+ require 'headless'
4
+
5
+ $LOAD_PATH.unshift File.expand_path('../roro_crawler', __FILE__)
6
+ require 'methods'
7
+ require 'base'
@@ -0,0 +1,40 @@
1
+ module RoRoCrawler
2
+ class Base
3
+
4
+ include RoRoSupport::Crawler
5
+
6
+ def spider(url, auchor_selector, intr_selector)
7
+ @url = url
8
+ @home_url = get_home_url(url)
9
+ @auchor_selector = auchor_selector
10
+ @intr_selector = intr_selector
11
+ get_link_titles
12
+ get_intrs
13
+ browser_close
14
+ end
15
+
16
+ def get_link_titles
17
+ @link_titles = get_tags_attrs_from(@url, @auchor_selector, 'href', 'text')
18
+ end
19
+
20
+ def get_intrs
21
+ raise "#{@link_titles} is nil" if @link_titles.nil?
22
+ @link_title_intrs = @link_titles.dup
23
+ @link_title_intrs.each do |link_content|
24
+ if link_content[0][/http/]
25
+ link = link_content[0]
26
+ else
27
+ link = "#{@home_url}#{link_content[0]}"
28
+ end
29
+
30
+ link_content << intr = get_tags_attrs_from(link, @intr_selector, 'text')
31
+ raise "crawler find multi intrs according #{@intr_selector}, please make @intr_selector more exactly, error intr is #{intr}" unless intr.is_a? String
32
+ link_content
33
+ end
34
+ end
35
+
36
+ def handler(&blk)
37
+ handle @link_title_intrs, &blk
38
+ end
39
+ end
40
+ end
@@ -0,0 +1,18 @@
1
+ #require 'nokogiri'
2
+ #module RoRoCrawler
3
+ # module Methods
4
+ # include ::RoRoSupport::Crawler
5
+ # def get_intr_from(html)
6
+ # doc = Nokogiri::HTML.parse html
7
+ # doc.css('.jobIntro, .j_i')
8
+ # .gsub(/(本站提醒:如何识别虚假招聘信息?求职必看,切勿受骗上当!)|(如何写一份简单、直接、高效的求职信?)/, '')
9
+ # end
10
+ #
11
+ # def get_links_contents_from(url, selector)
12
+ # as = @browser.css @link_selector
13
+ # unless as.nil?
14
+ # @offers = as.collect('text', 'href')
15
+ # end
16
+ # end
17
+ # end
18
+ #end
@@ -1,3 +1,3 @@
1
- module Crawler
2
- VERSION = "0.0.3"
1
+ module RoRoCrawler
2
+ VERSION = "0.0.4"
3
3
  end
@@ -0,0 +1,68 @@
1
+ require 'spec_helper'
2
+
3
+ describe ::RoRoCrawler::Base do
4
+ before do
5
+ #Headless.new.start
6
+ end
7
+ it 'spider yjs' do
8
+ url = 'http://s.yingjiesheng.com/result.jsp?keyword=%E5%89%8D%E7%AB%AF%E5%AE%9E%E4%B9%A0&city=0&jobtype=0&do=1&stype=0'
9
+ expect(
10
+ ::RoRoCrawler::Base.new.spider(url, 'h3.title>a', '.job, .j_i')
11
+ ).not_to raise_error
12
+ end
13
+
14
+ describe 'spider v2ex' do
15
+ before do
16
+ @v2ex_spider = Base.new
17
+ end
18
+
19
+ it 'case1' do
20
+ url = 'http://www.v2ex.com/go/jobs?p=1'
21
+ expect(
22
+ ::RoRoCrawler::Base.new.spider(url, 'span.item_title>a', '.topic_content')
23
+ ).not_to raise_error
24
+ end
25
+
26
+ it 'case2' do
27
+ @v2ex_spider.instance_variable_set(:@link_titles, [
28
+ ["/t/79990#reply63", "[北京/杭州] 阿里巴巴2014校园招聘优秀人才内部同事推荐计划"]
29
+ ])
30
+ @v2ex_spider.instance_variable_set(:@home_url, "http://www.v2ex.com")
31
+ @v2ex_spider.instance_variable_set(:@intr_selector, ".cell>.topic_content")
32
+ @v2ex_spider.get_intrs
33
+ end
34
+ end
35
+
36
+ it 'spider ruby-china' do
37
+ url = 'http://ruby-china.org/topics/node25'
38
+ expect(
39
+ ::RoRoCrawler::Base.new.spider(url, '.title>a', '.entry_content')
40
+ ).not_to raise_error
41
+ end
42
+
43
+ it 'spider ruby-china intr not nil' do
44
+ expect(
45
+ get_tags_attrs_from('http://ruby-china.org/topics/13700', '.body.entry-content', 'inner_html')
46
+ ).not_to be_empty
47
+ end
48
+
49
+ it 'handler' do
50
+ ::RoRoCrawler::Base.class_eval do
51
+ def handler(&blk)
52
+ results = [
53
+ [1, 2, 3],
54
+ [4, 5, 6],
55
+ [7, 8, 9]
56
+ ]
57
+
58
+ handle results, &blk
59
+ end
60
+ end
61
+
62
+ ::RoRoCrawler::Base.new.handler do |a, b, c|
63
+ expect(a.to_s).to match /^\d$/
64
+ expect(b.to_s).to match /^\d$/
65
+ expect(c.to_s).to match /^\d$/
66
+ end
67
+ end
68
+ end
@@ -0,0 +1 @@
1
+ require 'spec_helper'
@@ -1,13 +1,17 @@
1
- # This file was generated by the `rspec --init` command. Conventionally, all
2
- # specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
3
- # Require this file using `require "spec_helper"` to ensure that it is only
4
- # loaded once.
5
- #
6
- # See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
1
+ $LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
7
2
  Bundler.require
8
- require 'active_support'
9
- require File.expand_path('../../lib/roro_crawler', __FILE__)
3
+ require 'roro_support'
4
+ require 'roro_crawler'
5
+ include ::RoRoSupport::Crawler
6
+ include ::RoRoCrawler
10
7
  RSpec.configure do |config|
11
- config.before :all do
12
- end
8
+ config.treat_symbols_as_metadata_keys_with_true_values = true
9
+ config.run_all_when_everything_filtered = true
10
+ config.filter_run :focus
11
+
12
+ # Run specs in random order to surface order dependencies. If you find an
13
+ # order dependency and want to debug it, you can fix the order by providing
14
+ # the seed, which is printed after each run.
15
+ # --seed 1234
16
+ config.order = 'random'
13
17
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: roro_crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - roro
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-08-25 00:00:00.000000000 Z
11
+ date: 2013-09-06 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rails
@@ -24,26 +24,81 @@ dependencies:
24
24
  - - ~>
25
25
  - !ruby/object:Gem::Version
26
26
  version: 4.0.0
27
+ - !ruby/object:Gem::Dependency
28
+ name: watir-rails
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: nokogiri
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: headless
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - '>='
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: grit
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - '>='
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - '>='
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
27
83
  description: ''
28
84
  email:
29
- - roro@gmail.com
85
+ - zhuxingruo3@gmail.com
30
86
  executables: []
31
87
  extensions: []
32
88
  extra_rdoc_files: []
33
89
  files:
34
- - lib/roro_crawler/crawler_handler.rb
35
90
  - lib/roro_crawler/version.rb
36
- - lib/roro_crawler/crawler_class.rb
91
+ - lib/roro_crawler/base.rb
92
+ - lib/roro_crawler/methods.rb
37
93
  - lib/roro_crawler.rb
38
- - lib/tasks/crawler_tasks.rake
39
94
  - MIT-LICENSE
40
95
  - Rakefile
41
96
  - README.rdoc
42
97
  - spec/fixtures/intr.html
43
98
  - spec/fixtures/list.html
44
99
  - spec/spec_helper.rb
45
- - spec/crawler_handler_spec.rb
46
- - spec/crawler_class_spec.rb
100
+ - spec/roro_crawler/base_spec.rb
101
+ - spec/roro_crawler_spec.rb
47
102
  homepage: ''
48
103
  licenses: []
49
104
  metadata: {}
@@ -71,5 +126,5 @@ test_files:
71
126
  - spec/fixtures/intr.html
72
127
  - spec/fixtures/list.html
73
128
  - spec/spec_helper.rb
74
- - spec/crawler_handler_spec.rb
75
- - spec/crawler_class_spec.rb
129
+ - spec/roro_crawler/base_spec.rb
130
+ - spec/roro_crawler_spec.rb
@@ -1,98 +0,0 @@
1
- require 'roro_support'
2
- require 'active_support'
3
- require File.expand_path('../crawler_handler', __FILE__)
4
-
5
- include Crawler
6
-
7
- module Crawler
8
- class Klass
9
- attr_accessor :offers, :browser, :link, :title, :intr, :page
10
-
11
- def initialize(options={})
12
- @visible = options[:visible]
13
- @link_selector = 'h3.title>a'
14
- @offers = Hash.new
15
- @page = 0
16
-
17
- if @visible
18
- @browser = crawler visible: @visible
19
- else
20
- @browser = crawler
21
- end
22
- end
23
-
24
- def goto_next
25
- @page += 1
26
- @browser.goto "http://s.yingjiesheng.com/result.jsp?keyword=web&start=#{@page*10}&period=0&sort=score&jobtype=0"
27
- end
28
-
29
- def site(keyword, page_num, options={})
30
- if url.nil?
31
- p <<-MSG
32
- please add
33
- def site
34
- url = 'http://website.com'
35
- super
36
- end
37
- MSG
38
- end
39
- @browser.goto url
40
-
41
- page_num.to_i.times do
42
- links
43
- link_contents
44
- "http://s.yingjiesheng.com/result.jsp?keyword=web&start=#{@page*10}&period=0&sort=score&jobtype=0"
45
- goto_next
46
- end
47
-
48
- @browser.close
49
- end
50
-
51
-
52
- def close
53
- @browser.close
54
- end
55
-
56
-
57
- def links
58
- as = @browser.css @link_selector
59
- unless as.nil?
60
- @offers = as.
61
-
62
- collect('text', 'href')
63
- end
64
- end
65
-
66
- def link_contents
67
- @offers.each do |title, link|
68
- if title && link
69
- @title = title
70
- @link = link
71
- @intr = msg link
72
- safe_save
73
- end
74
- end
75
- end
76
-
77
- def safe_save
78
- msg = <<-MSG
79
- title: #{@title}
80
- link: #{@link}
81
- MSG
82
-
83
- Rails.logger.info msg
84
-
85
- p @intr
86
- return if @intr.nil?
87
- Offer.create(link: @link, title: @title, intr: @intr, from: 'yjs')
88
- end
89
-
90
-
91
- def msg(href)
92
- if href[/http\:\/\/www\.yingjiesheng\.com\/job\-\w+/]
93
- @browser.goto href
94
- return Handler.get_intr_from(@browser.html)
95
- end
96
- end
97
- end
98
- end
@@ -1,13 +0,0 @@
1
- require 'nokogiri'
2
- module Crawler
3
- module Handler
4
- class << self
5
- def get_intr_from(html)
6
- doc = Nokogiri::HTML.parse html
7
- doc.css('.jobIntro, .j_i')
8
- .text
9
- .gsub(/(本站提醒:如何识别虚假招聘信息?求职必看,切勿受骗上当!)|(如何写一份简单、直接、高效的求职信?)/, '')
10
- end
11
- end
12
- end
13
- end
@@ -1,4 +0,0 @@
1
- # desc "Explaining what the task does"
2
- # task :roro_crawler do
3
- # # Task goes here
4
- # end
@@ -1,23 +0,0 @@
1
- require 'spec_helper'
2
- require 'roro_support'
3
-
4
- describe 'Crawler' do
5
- before do
6
- end
7
-
8
- after do
9
- if @c.browser
10
-
11
- end
12
- end
13
-
14
- describe "links" do
15
- before do
16
-
17
- end
18
-
19
- it 'links can get links correctly' do
20
-
21
- end
22
- end
23
- end
@@ -1,21 +0,0 @@
1
- require 'spec_helper'
2
- include Crawler
3
-
4
- describe "Handler" do
5
- describe 'get_intr_from' do
6
- before :all do
7
- @pass = lambda do
8
- content = File.read(@fixtures[:intr])
9
- intr = Handler::get_intr_from content
10
- print intr
11
- expect(intr.length).to be < 1000
12
- expect(intr).not_to include "本站提醒:如何识别虚假招聘信息?求职必看,切勿受骗上当!"
13
- expect(intr).not_to include "如何写一份简单、直接、高效的求职信?"
14
- end
15
-
16
- end
17
- it 'pass spec1' do
18
- @pass.call
19
- end
20
- end
21
- end