spider_rails 4.0.2 → 4.0.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Rakefile +2 -0
- data/lib/spider_rails/{common.rb → common/common.rb} +2 -26
- data/lib/spider_rails/common/dsl.rb +32 -0
- data/lib/spider_rails/common/rspec.rb +5 -0
- data/lib/spider_rails/{bilibili.rb → specific/bilibili.rb} +2 -2
- data/lib/spider_rails/specific/google_dict.rb +91 -0
- data/lib/spider_rails/{ji_ying.rb → specific/ji_ying.rb} +21 -10
- data/lib/spider_rails/{spread_sheet.rb → specific/spread_sheet.rb} +0 -0
- data/lib/spider_rails/version.rb +1 -1
- data/lib/spider_rails.rb +1 -1
- metadata +107 -9
- data/lib/spider_rails/google_dict.rb +0 -91
- data/lib/spider_rails/hl.rb +0 -13
- data/lib/spider_rails/sample_data.rb +0 -10
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e9ae14d4aa7c198c8e42c9075b9cf76457c1d6a9
|
4
|
+
data.tar.gz: 1177b0a8dfe1c3715050a9bd5dcb25f14daaf32d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3b1573e5ebed6b5bce30e04f715a0058d59d3eeaf8fad387a93fe8fecd71334a471cd946aee1d7f150f865ad8bc8c6ace4672b9baa2ceaaeff302eb5b2b44778
|
7
|
+
data.tar.gz: 29b0d0bcccd465fc61f11e704e655e505d024d2ac9646d415398cfea67c93a665ff33af37ba6b2f73133b9a1830346d1013a21e7d4ea99f3c862b9f13e61a321
|
data/Rakefile
CHANGED
@@ -1,28 +1,5 @@
|
|
1
|
-
module
|
1
|
+
module Common
|
2
2
|
class Common
|
3
|
-
def full_site
|
4
|
-
|
5
|
-
end
|
6
|
-
|
7
|
-
def full_site_filter
|
8
|
-
|
9
|
-
end
|
10
|
-
|
11
|
-
def full_page
|
12
|
-
close_all_chromes
|
13
|
-
end
|
14
|
-
|
15
|
-
def get element
|
16
|
-
|
17
|
-
end
|
18
|
-
|
19
|
-
def single(element)
|
20
|
-
|
21
|
-
end
|
22
|
-
|
23
|
-
def single_filter
|
24
|
-
end
|
25
|
-
|
26
3
|
def get_content(element, selector, &block)
|
27
4
|
begin
|
28
5
|
if block_given?
|
@@ -37,11 +14,10 @@ module Spider
|
|
37
14
|
end
|
38
15
|
end
|
39
16
|
end
|
40
|
-
|
41
17
|
end
|
42
18
|
|
43
19
|
class << self
|
44
|
-
def
|
20
|
+
def start driver, url
|
45
21
|
#@browser = Watir::Browser.new :chrome, switches: %w( --user-data-dir=/home/zxr/.config/google-chrome)
|
46
22
|
@browser = Watir::Browser.new driver
|
47
23
|
@browser.goto url
|
@@ -0,0 +1,32 @@
|
|
1
|
+
require 'watir-webdriver'
|
2
|
+
require 'nokogiri'
|
3
|
+
module DSL
|
4
|
+
# Add a visit method to ::Watir::Browser
|
5
|
+
class Browser < ::Watir::Browser
|
6
|
+
def visit(relative_url = nil, base_url = 'http://localhost:3000/')
|
7
|
+
goto("#{base_url}#{relative_url}")
|
8
|
+
end
|
9
|
+
|
10
|
+
def initialize(browser = :phantomjs, *args)
|
11
|
+
super
|
12
|
+
end
|
13
|
+
|
14
|
+
def dsl_enable
|
15
|
+
@doc = Nokogiri::HTML.parse(self.html)
|
16
|
+
::String.class_variable_set(:@@doc, @doc)
|
17
|
+
eval <<-RUBY
|
18
|
+
class ::String
|
19
|
+
def ctn
|
20
|
+
if block_given?
|
21
|
+
@@doc.css(self) &block
|
22
|
+
else
|
23
|
+
@@doc.css(self).each do |e|
|
24
|
+
return e.text
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
29
|
+
RUBY
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
@@ -0,0 +1,91 @@
|
|
1
|
+
module Specific
|
2
|
+
class GoogleDict
|
3
|
+
def get_cards(keywords)
|
4
|
+
Headless.new.start
|
5
|
+
keywords.each do |keyword|
|
6
|
+
unless Card.find_by_word(keyword)
|
7
|
+
get_card keyword
|
8
|
+
save_record(Card, word: @card[:Word],
|
9
|
+
voice: @card[:Voice],
|
10
|
+
verb: @card[:Verb],
|
11
|
+
adj: @card[:Adjective],
|
12
|
+
noun: @card[:Noun],
|
13
|
+
pronoun: @card[:Pronoun],
|
14
|
+
synonyms: @card[:Synonyms],
|
15
|
+
abbr: @card[:Abbreviation],
|
16
|
+
prep: @card[:Preposition],
|
17
|
+
conj: @card[:Conjunction]
|
18
|
+
)
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
23
|
+
def get_keywords(path)
|
24
|
+
f = File.new(path)
|
25
|
+
dict = f.read.split(/\W/)
|
26
|
+
dict.delete("")
|
27
|
+
dict.uniq!
|
28
|
+
dict
|
29
|
+
end
|
30
|
+
|
31
|
+
class << self
|
32
|
+
def alias_methods(*args)
|
33
|
+
args.each do |arg|
|
34
|
+
alias_method arg, args.last
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
def get_card keyword
|
40
|
+
@card = Hash.new
|
41
|
+
@page = start "https://www.google.com.hk/search?newwindow=1&safe=strict&q=#{keyword}+define&oq=#{keyword}+define"
|
42
|
+
|
43
|
+
doc = Nokogiri::HTML.parse @page.html
|
44
|
+
|
45
|
+
GoogleDict.alias_methods :card, :voice, :word, :get_content
|
46
|
+
card(doc, 'li.dct') do |c|
|
47
|
+
@card[:Word] = keyword.downcase
|
48
|
+
@card[:Voice] = voice(c, 'h3+.vk_sh')
|
49
|
+
|
50
|
+
# Get word explainations
|
51
|
+
get_explain(c)
|
52
|
+
end
|
53
|
+
|
54
|
+
@page.close
|
55
|
+
@card.delete(0)
|
56
|
+
@card
|
57
|
+
end
|
58
|
+
|
59
|
+
def get_explain(c)
|
60
|
+
type_nodes = c.css('div.vk_gy.vk_sh').to_a
|
61
|
+
content_nodes = c.css('div.vk_gy.vk_sh+div').to_a
|
62
|
+
type_nodes.each_with_index do |t, i|
|
63
|
+
table = content_nodes[i]
|
64
|
+
if table.css('li').count >= 2
|
65
|
+
fin_content = Array.new
|
66
|
+
table.css('li').each do |l|
|
67
|
+
fin_content << l.content
|
68
|
+
end
|
69
|
+
else
|
70
|
+
fin_content = table.content
|
71
|
+
end
|
72
|
+
@card[t.text.to_sym] = fin_content
|
73
|
+
end
|
74
|
+
end
|
75
|
+
|
76
|
+
#def login(username, password)
|
77
|
+
# element?('a.gbgt#gb_70') { |e| e.click }
|
78
|
+
# @b.text_field(name: 'Email').set username
|
79
|
+
# @b.text_field(name: 'Passwd').set password
|
80
|
+
# element?('input#signIn') { |e| e.click }
|
81
|
+
#end
|
82
|
+
|
83
|
+
def element?(selector, &block)
|
84
|
+
e = @page.element(css: selector)
|
85
|
+
if yield e
|
86
|
+
else
|
87
|
+
'element is nil'
|
88
|
+
end
|
89
|
+
end
|
90
|
+
end
|
91
|
+
end
|
@@ -1,5 +1,10 @@
|
|
1
|
-
module
|
2
|
-
|
1
|
+
module Specific
|
2
|
+
# Download JiYing resources
|
3
|
+
# example:
|
4
|
+
# @page = ::Spider.open_browser(:phantomjs, 'http://bt.ktxp.com/sort-50-1.html')
|
5
|
+
# jy = ::Spider::JiYing.new(@page)
|
6
|
+
# jy.full_site
|
7
|
+
class JiYing
|
3
8
|
attr_accessor :ani, :anis, :page
|
4
9
|
|
5
10
|
def initialize page
|
@@ -21,19 +26,23 @@ module Spider
|
|
21
26
|
(1..fp).each do |page_num|
|
22
27
|
full_page page_num
|
23
28
|
end
|
29
|
+
p_anis
|
24
30
|
rescue Exception
|
25
31
|
raise %Q(page isn't not exist)
|
26
32
|
end
|
27
33
|
end
|
28
34
|
|
29
35
|
def multi_pages final_page_num
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
end
|
36
|
+
(1..final_page_num).each do |page_num|
|
37
|
+
full_page page_num
|
38
|
+
p_anis
|
34
39
|
end
|
35
40
|
end
|
36
41
|
|
42
|
+
def p_anis
|
43
|
+
p "@anis is #{@anis}"
|
44
|
+
end
|
45
|
+
|
37
46
|
def final_page
|
38
47
|
if @mode == 'search'
|
39
48
|
fp = @page.element(css: '.title h2 a').text[/\(.+\)/].gsub!(/\(|\)/, '').to_i/100 + 1
|
@@ -51,7 +60,7 @@ module Spider
|
|
51
60
|
when 'normal'
|
52
61
|
@page.goto "#{@base_url}#{page_num}.html"
|
53
62
|
end
|
54
|
-
html = Nokogiri::HTML.parse @page.html
|
63
|
+
html = ::Nokogiri::HTML.parse @page.html
|
55
64
|
|
56
65
|
html.css('.ltext').each do |td|
|
57
66
|
single(td)
|
@@ -70,11 +79,13 @@ module Spider
|
|
70
79
|
@ani[:title],
|
71
80
|
@ani[:size],
|
72
81
|
@ani[:finish] = get_content(element, 'a.quick-down+a', 'td.ltext+td', 'td.ltext+td+td+td+td')
|
73
|
-
|
82
|
+
p "Get Animation: #{@ani[:title]}"
|
83
|
+
|
84
|
+
@anis << @ani.dup
|
74
85
|
end
|
75
86
|
|
76
|
-
def
|
77
|
-
|
87
|
+
def ani_count
|
88
|
+
@anis.uniq.count if @anis
|
78
89
|
end
|
79
90
|
|
80
91
|
def get_content(element, *selectors)
|
File without changes
|
data/lib/spider_rails/version.rb
CHANGED
data/lib/spider_rails.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
1
|
# -*- encoding : utf-8 -*-
|
2
2
|
current_file_name = __FILE__.split('/').last.gsub('.rb', '')
|
3
|
-
Dir[File.expand_path("../#{current_file_name}
|
3
|
+
Dir[File.expand_path("../#{current_file_name}/**/*.rb", __FILE__)].each { |file| require file }
|
4
4
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: spider_rails
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 4.0.
|
4
|
+
version: 4.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- zhuxingruo
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-07-
|
11
|
+
date: 2013-07-21 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rails
|
@@ -38,6 +38,104 @@ dependencies:
|
|
38
38
|
- - '>='
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: watir
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - '>='
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: nokogiri
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - '>='
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: headless
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - '>='
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - '>='
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: rspec
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - '>='
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
type: :runtime
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - '>='
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: guard
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - '>='
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
type: :runtime
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - '>='
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
111
|
+
- !ruby/object:Gem::Dependency
|
112
|
+
name: guard-rspec
|
113
|
+
requirement: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - '>='
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '0'
|
118
|
+
type: :runtime
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
requirements:
|
122
|
+
- - '>='
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: '0'
|
125
|
+
- !ruby/object:Gem::Dependency
|
126
|
+
name: spork
|
127
|
+
requirement: !ruby/object:Gem::Requirement
|
128
|
+
requirements:
|
129
|
+
- - '>='
|
130
|
+
- !ruby/object:Gem::Version
|
131
|
+
version: '0'
|
132
|
+
type: :runtime
|
133
|
+
prerelease: false
|
134
|
+
version_requirements: !ruby/object:Gem::Requirement
|
135
|
+
requirements:
|
136
|
+
- - '>='
|
137
|
+
- !ruby/object:Gem::Version
|
138
|
+
version: '0'
|
41
139
|
description: nil
|
42
140
|
email:
|
43
141
|
- zhuxingruo3@gmail.com
|
@@ -45,14 +143,14 @@ executables: []
|
|
45
143
|
extensions: []
|
46
144
|
extra_rdoc_files: []
|
47
145
|
files:
|
48
|
-
- lib/spider_rails/common.rb
|
49
146
|
- lib/spider_rails/version.rb
|
50
|
-
- lib/spider_rails/bilibili.rb
|
51
|
-
- lib/spider_rails/
|
52
|
-
- lib/spider_rails/
|
53
|
-
- lib/spider_rails/
|
54
|
-
- lib/spider_rails/
|
55
|
-
- lib/spider_rails/
|
147
|
+
- lib/spider_rails/specific/bilibili.rb
|
148
|
+
- lib/spider_rails/specific/spread_sheet.rb
|
149
|
+
- lib/spider_rails/specific/google_dict.rb
|
150
|
+
- lib/spider_rails/specific/ji_ying.rb
|
151
|
+
- lib/spider_rails/common/common.rb
|
152
|
+
- lib/spider_rails/common/rspec.rb
|
153
|
+
- lib/spider_rails/common/dsl.rb
|
56
154
|
- lib/spider_rails.rb
|
57
155
|
- MIT-LICENSE
|
58
156
|
- Rakefile
|
@@ -1,91 +0,0 @@
|
|
1
|
-
module Spider
|
2
|
-
class GoogleDict < Common
|
3
|
-
def get_cards(keywords)
|
4
|
-
Headless.new.start
|
5
|
-
keywords.each do |keyword|
|
6
|
-
unless Card.find_by_word(keyword)
|
7
|
-
get_card keyword
|
8
|
-
save_record(Card, word: @card[:Word],
|
9
|
-
voice: @card[:Voice],
|
10
|
-
verb: @card[:Verb],
|
11
|
-
adj: @card[:Adjective],
|
12
|
-
noun: @card[:Noun],
|
13
|
-
pronoun: @card[:Pronoun],
|
14
|
-
synonyms: @card[:Synonyms],
|
15
|
-
abbr: @card[:Abbreviation],
|
16
|
-
prep: @card[:Preposition],
|
17
|
-
conj: @card[:Conjunction]
|
18
|
-
)
|
19
|
-
end
|
20
|
-
end
|
21
|
-
end
|
22
|
-
|
23
|
-
def get_keywords(path)
|
24
|
-
f = File.new(path)
|
25
|
-
dict = f.read.split(/\W/)
|
26
|
-
dict.delete("")
|
27
|
-
dict.uniq!
|
28
|
-
dict
|
29
|
-
end
|
30
|
-
|
31
|
-
class << self
|
32
|
-
def alias_methods(*args)
|
33
|
-
args.each do |arg|
|
34
|
-
alias_method arg, args.last
|
35
|
-
end
|
36
|
-
end
|
37
|
-
end
|
38
|
-
|
39
|
-
def get_card keyword
|
40
|
-
@card = Hash.new
|
41
|
-
@page = open_browser "https://www.google.com.hk/search?newwindow=1&safe=strict&q=#{keyword}+define&oq=#{keyword}+define"
|
42
|
-
|
43
|
-
doc = Nokogiri::HTML.parse @page.html
|
44
|
-
|
45
|
-
GoogleDict.alias_methods :card, :voice, :word, :get_content
|
46
|
-
card(doc, 'li.dct') do |c|
|
47
|
-
@card[:Word] = keyword.downcase
|
48
|
-
@card[:Voice] = voice(c, 'h3+.vk_sh')
|
49
|
-
|
50
|
-
# Get word explainations
|
51
|
-
get_explain(c)
|
52
|
-
end
|
53
|
-
|
54
|
-
@page.close
|
55
|
-
@card.delete(0)
|
56
|
-
@card
|
57
|
-
end
|
58
|
-
|
59
|
-
def get_explain(c)
|
60
|
-
type_nodes = c.css('div.vk_gy.vk_sh').to_a
|
61
|
-
content_nodes = c.css('div.vk_gy.vk_sh+div').to_a
|
62
|
-
type_nodes.each_with_index do |t, i|
|
63
|
-
table = content_nodes[i]
|
64
|
-
if table.css('li').count >= 2
|
65
|
-
fin_content = Array.new
|
66
|
-
table.css('li').each do |l|
|
67
|
-
fin_content << l.content
|
68
|
-
end
|
69
|
-
else
|
70
|
-
fin_content = table.content
|
71
|
-
end
|
72
|
-
@card[t.text.to_sym] = fin_content
|
73
|
-
end
|
74
|
-
end
|
75
|
-
|
76
|
-
#def login(username, password)
|
77
|
-
# element?('a.gbgt#gb_70') { |e| e.click }
|
78
|
-
# @b.text_field(name: 'Email').set username
|
79
|
-
# @b.text_field(name: 'Passwd').set password
|
80
|
-
# element?('input#signIn') { |e| e.click }
|
81
|
-
#end
|
82
|
-
|
83
|
-
def element?(selector, &block)
|
84
|
-
e = @page.element(css: selector)
|
85
|
-
if yield e
|
86
|
-
else
|
87
|
-
'element is nil'
|
88
|
-
end
|
89
|
-
end
|
90
|
-
end
|
91
|
-
end
|
data/lib/spider_rails/hl.rb
DELETED
@@ -1,13 +0,0 @@
|
|
1
|
-
module Lib
|
2
|
-
module Hl
|
3
|
-
class << self
|
4
|
-
def run
|
5
|
-
h = Headless.new
|
6
|
-
h.start
|
7
|
-
b = Watir::Browser.new :chrome, switches: %w[--proxy-server=socks5://127.0.0.1:7070]
|
8
|
-
b.goto 'https://www.google.com.hk/search?q=google+define&oq=google+define'
|
9
|
-
p b.title
|
10
|
-
end
|
11
|
-
end
|
12
|
-
end
|
13
|
-
end
|
@@ -1,10 +0,0 @@
|
|
1
|
-
module Spider
|
2
|
-
class SampleData < Common
|
3
|
-
def generate
|
4
|
-
100.times do |n|
|
5
|
-
Novel.create(title: "やめて#{n}", content: 'やめて やめてやめて やめてやめて やめてやめて やめてやめて やめてやめて やめてやめて やめてやめて やめてやめて やめてやめて やめてやめて やめてやめて やめてやめて やめてやめて やめてやめて やめてやめて やめてやめて やめてやめて やめてやめて やめて')
|
6
|
-
Card.create(word: "やめて#{n}", voice: 'やめて やめてやめて やめてやめて やめてやめて やめてやめて やめてやめて やめてやめて やめてやめて やめてやめて やめてやめて やめてやめて やめてやめて やめてやめて やめてやめて やめてやめて やめて')
|
7
|
-
end
|
8
|
-
end
|
9
|
-
end
|
10
|
-
end
|