spider_rails 4.0.2 → 4.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Rakefile +2 -0
- data/lib/spider_rails/{common.rb → common/common.rb} +2 -26
- data/lib/spider_rails/common/dsl.rb +32 -0
- data/lib/spider_rails/common/rspec.rb +5 -0
- data/lib/spider_rails/{bilibili.rb → specific/bilibili.rb} +2 -2
- data/lib/spider_rails/specific/google_dict.rb +91 -0
- data/lib/spider_rails/{ji_ying.rb → specific/ji_ying.rb} +21 -10
- data/lib/spider_rails/{spread_sheet.rb → specific/spread_sheet.rb} +0 -0
- data/lib/spider_rails/version.rb +1 -1
- data/lib/spider_rails.rb +1 -1
- metadata +107 -9
- data/lib/spider_rails/google_dict.rb +0 -91
- data/lib/spider_rails/hl.rb +0 -13
- data/lib/spider_rails/sample_data.rb +0 -10
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e9ae14d4aa7c198c8e42c9075b9cf76457c1d6a9
|
4
|
+
data.tar.gz: 1177b0a8dfe1c3715050a9bd5dcb25f14daaf32d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3b1573e5ebed6b5bce30e04f715a0058d59d3eeaf8fad387a93fe8fecd71334a471cd946aee1d7f150f865ad8bc8c6ace4672b9baa2ceaaeff302eb5b2b44778
|
7
|
+
data.tar.gz: 29b0d0bcccd465fc61f11e704e655e505d024d2ac9646d415398cfea67c93a665ff33af37ba6b2f73133b9a1830346d1013a21e7d4ea99f3c862b9f13e61a321
|
data/Rakefile
CHANGED
@@ -1,28 +1,5 @@
|
|
1
|
-
module
|
1
|
+
module Common
|
2
2
|
class Common
|
3
|
-
def full_site
|
4
|
-
|
5
|
-
end
|
6
|
-
|
7
|
-
def full_site_filter
|
8
|
-
|
9
|
-
end
|
10
|
-
|
11
|
-
def full_page
|
12
|
-
close_all_chromes
|
13
|
-
end
|
14
|
-
|
15
|
-
def get element
|
16
|
-
|
17
|
-
end
|
18
|
-
|
19
|
-
def single(element)
|
20
|
-
|
21
|
-
end
|
22
|
-
|
23
|
-
def single_filter
|
24
|
-
end
|
25
|
-
|
26
3
|
def get_content(element, selector, &block)
|
27
4
|
begin
|
28
5
|
if block_given?
|
@@ -37,11 +14,10 @@ module Spider
|
|
37
14
|
end
|
38
15
|
end
|
39
16
|
end
|
40
|
-
|
41
17
|
end
|
42
18
|
|
43
19
|
class << self
|
44
|
-
def
|
20
|
+
def start driver, url
|
45
21
|
#@browser = Watir::Browser.new :chrome, switches: %w( --user-data-dir=/home/zxr/.config/google-chrome)
|
46
22
|
@browser = Watir::Browser.new driver
|
47
23
|
@browser.goto url
|
@@ -0,0 +1,32 @@
|
|
1
|
+
require 'watir-webdriver'
|
2
|
+
require 'nokogiri'
|
3
|
+
module DSL
|
4
|
+
# Add a visit method to ::Watir::Browser
|
5
|
+
class Browser < ::Watir::Browser
|
6
|
+
def visit(relative_url = nil, base_url = 'http://localhost:3000/')
|
7
|
+
goto("#{base_url}#{relative_url}")
|
8
|
+
end
|
9
|
+
|
10
|
+
def initialize(browser = :phantomjs, *args)
|
11
|
+
super
|
12
|
+
end
|
13
|
+
|
14
|
+
def dsl_enable
|
15
|
+
@doc = Nokogiri::HTML.parse(self.html)
|
16
|
+
::String.class_variable_set(:@@doc, @doc)
|
17
|
+
eval <<-RUBY
|
18
|
+
class ::String
|
19
|
+
def ctn
|
20
|
+
if block_given?
|
21
|
+
@@doc.css(self) &block
|
22
|
+
else
|
23
|
+
@@doc.css(self).each do |e|
|
24
|
+
return e.text
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
29
|
+
RUBY
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
@@ -0,0 +1,91 @@
|
|
1
|
+
module Specific
|
2
|
+
class GoogleDict
|
3
|
+
def get_cards(keywords)
|
4
|
+
Headless.new.start
|
5
|
+
keywords.each do |keyword|
|
6
|
+
unless Card.find_by_word(keyword)
|
7
|
+
get_card keyword
|
8
|
+
save_record(Card, word: @card[:Word],
|
9
|
+
voice: @card[:Voice],
|
10
|
+
verb: @card[:Verb],
|
11
|
+
adj: @card[:Adjective],
|
12
|
+
noun: @card[:Noun],
|
13
|
+
pronoun: @card[:Pronoun],
|
14
|
+
synonyms: @card[:Synonyms],
|
15
|
+
abbr: @card[:Abbreviation],
|
16
|
+
prep: @card[:Preposition],
|
17
|
+
conj: @card[:Conjunction]
|
18
|
+
)
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
23
|
+
def get_keywords(path)
|
24
|
+
f = File.new(path)
|
25
|
+
dict = f.read.split(/\W/)
|
26
|
+
dict.delete("")
|
27
|
+
dict.uniq!
|
28
|
+
dict
|
29
|
+
end
|
30
|
+
|
31
|
+
class << self
|
32
|
+
def alias_methods(*args)
|
33
|
+
args.each do |arg|
|
34
|
+
alias_method arg, args.last
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
def get_card keyword
|
40
|
+
@card = Hash.new
|
41
|
+
@page = start "https://www.google.com.hk/search?newwindow=1&safe=strict&q=#{keyword}+define&oq=#{keyword}+define"
|
42
|
+
|
43
|
+
doc = Nokogiri::HTML.parse @page.html
|
44
|
+
|
45
|
+
GoogleDict.alias_methods :card, :voice, :word, :get_content
|
46
|
+
card(doc, 'li.dct') do |c|
|
47
|
+
@card[:Word] = keyword.downcase
|
48
|
+
@card[:Voice] = voice(c, 'h3+.vk_sh')
|
49
|
+
|
50
|
+
# Get word explainations
|
51
|
+
get_explain(c)
|
52
|
+
end
|
53
|
+
|
54
|
+
@page.close
|
55
|
+
@card.delete(0)
|
56
|
+
@card
|
57
|
+
end
|
58
|
+
|
59
|
+
def get_explain(c)
|
60
|
+
type_nodes = c.css('div.vk_gy.vk_sh').to_a
|
61
|
+
content_nodes = c.css('div.vk_gy.vk_sh+div').to_a
|
62
|
+
type_nodes.each_with_index do |t, i|
|
63
|
+
table = content_nodes[i]
|
64
|
+
if table.css('li').count >= 2
|
65
|
+
fin_content = Array.new
|
66
|
+
table.css('li').each do |l|
|
67
|
+
fin_content << l.content
|
68
|
+
end
|
69
|
+
else
|
70
|
+
fin_content = table.content
|
71
|
+
end
|
72
|
+
@card[t.text.to_sym] = fin_content
|
73
|
+
end
|
74
|
+
end
|
75
|
+
|
76
|
+
#def login(username, password)
|
77
|
+
# element?('a.gbgt#gb_70') { |e| e.click }
|
78
|
+
# @b.text_field(name: 'Email').set username
|
79
|
+
# @b.text_field(name: 'Passwd').set password
|
80
|
+
# element?('input#signIn') { |e| e.click }
|
81
|
+
#end
|
82
|
+
|
83
|
+
def element?(selector, &block)
|
84
|
+
e = @page.element(css: selector)
|
85
|
+
if yield e
|
86
|
+
else
|
87
|
+
'element is nil'
|
88
|
+
end
|
89
|
+
end
|
90
|
+
end
|
91
|
+
end
|
@@ -1,5 +1,10 @@
|
|
1
|
-
module
|
2
|
-
|
1
|
+
module Specific
|
2
|
+
# Download JiYing resources
|
3
|
+
# example:
|
4
|
+
# @page = ::Spider.open_browser(:phantomjs, 'http://bt.ktxp.com/sort-50-1.html')
|
5
|
+
# jy = ::Spider::JiYing.new(@page)
|
6
|
+
# jy.full_site
|
7
|
+
class JiYing
|
3
8
|
attr_accessor :ani, :anis, :page
|
4
9
|
|
5
10
|
def initialize page
|
@@ -21,19 +26,23 @@ module Spider
|
|
21
26
|
(1..fp).each do |page_num|
|
22
27
|
full_page page_num
|
23
28
|
end
|
29
|
+
p_anis
|
24
30
|
rescue Exception
|
25
31
|
raise %Q(page isn't not exist)
|
26
32
|
end
|
27
33
|
end
|
28
34
|
|
29
35
|
def multi_pages final_page_num
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
end
|
36
|
+
(1..final_page_num).each do |page_num|
|
37
|
+
full_page page_num
|
38
|
+
p_anis
|
34
39
|
end
|
35
40
|
end
|
36
41
|
|
42
|
+
def p_anis
|
43
|
+
p "@anis is #{@anis}"
|
44
|
+
end
|
45
|
+
|
37
46
|
def final_page
|
38
47
|
if @mode == 'search'
|
39
48
|
fp = @page.element(css: '.title h2 a').text[/\(.+\)/].gsub!(/\(|\)/, '').to_i/100 + 1
|
@@ -51,7 +60,7 @@ module Spider
|
|
51
60
|
when 'normal'
|
52
61
|
@page.goto "#{@base_url}#{page_num}.html"
|
53
62
|
end
|
54
|
-
html = Nokogiri::HTML.parse @page.html
|
63
|
+
html = ::Nokogiri::HTML.parse @page.html
|
55
64
|
|
56
65
|
html.css('.ltext').each do |td|
|
57
66
|
single(td)
|
@@ -70,11 +79,13 @@ module Spider
|
|
70
79
|
@ani[:title],
|
71
80
|
@ani[:size],
|
72
81
|
@ani[:finish] = get_content(element, 'a.quick-down+a', 'td.ltext+td', 'td.ltext+td+td+td+td')
|
73
|
-
|
82
|
+
p "Get Animation: #{@ani[:title]}"
|
83
|
+
|
84
|
+
@anis << @ani.dup
|
74
85
|
end
|
75
86
|
|
76
|
-
def
|
77
|
-
|
87
|
+
def ani_count
|
88
|
+
@anis.uniq.count if @anis
|
78
89
|
end
|
79
90
|
|
80
91
|
def get_content(element, *selectors)
|
File without changes
|
data/lib/spider_rails/version.rb
CHANGED
data/lib/spider_rails.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
1
|
# -*- encoding : utf-8 -*-
|
2
2
|
current_file_name = __FILE__.split('/').last.gsub('.rb', '')
|
3
|
-
Dir[File.expand_path("../#{current_file_name}
|
3
|
+
Dir[File.expand_path("../#{current_file_name}/**/*.rb", __FILE__)].each { |file| require file }
|
4
4
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: spider_rails
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 4.0.
|
4
|
+
version: 4.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- zhuxingruo
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-07-
|
11
|
+
date: 2013-07-21 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rails
|
@@ -38,6 +38,104 @@ dependencies:
|
|
38
38
|
- - '>='
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: watir
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - '>='
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: nokogiri
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - '>='
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: headless
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - '>='
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - '>='
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: rspec
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - '>='
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
type: :runtime
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - '>='
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: guard
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - '>='
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
type: :runtime
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - '>='
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
111
|
+
- !ruby/object:Gem::Dependency
|
112
|
+
name: guard-rspec
|
113
|
+
requirement: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - '>='
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '0'
|
118
|
+
type: :runtime
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
requirements:
|
122
|
+
- - '>='
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: '0'
|
125
|
+
- !ruby/object:Gem::Dependency
|
126
|
+
name: spork
|
127
|
+
requirement: !ruby/object:Gem::Requirement
|
128
|
+
requirements:
|
129
|
+
- - '>='
|
130
|
+
- !ruby/object:Gem::Version
|
131
|
+
version: '0'
|
132
|
+
type: :runtime
|
133
|
+
prerelease: false
|
134
|
+
version_requirements: !ruby/object:Gem::Requirement
|
135
|
+
requirements:
|
136
|
+
- - '>='
|
137
|
+
- !ruby/object:Gem::Version
|
138
|
+
version: '0'
|
41
139
|
description: nil
|
42
140
|
email:
|
43
141
|
- zhuxingruo3@gmail.com
|
@@ -45,14 +143,14 @@ executables: []
|
|
45
143
|
extensions: []
|
46
144
|
extra_rdoc_files: []
|
47
145
|
files:
|
48
|
-
- lib/spider_rails/common.rb
|
49
146
|
- lib/spider_rails/version.rb
|
50
|
-
- lib/spider_rails/bilibili.rb
|
51
|
-
- lib/spider_rails/
|
52
|
-
- lib/spider_rails/
|
53
|
-
- lib/spider_rails/
|
54
|
-
- lib/spider_rails/
|
55
|
-
- lib/spider_rails/
|
147
|
+
- lib/spider_rails/specific/bilibili.rb
|
148
|
+
- lib/spider_rails/specific/spread_sheet.rb
|
149
|
+
- lib/spider_rails/specific/google_dict.rb
|
150
|
+
- lib/spider_rails/specific/ji_ying.rb
|
151
|
+
- lib/spider_rails/common/common.rb
|
152
|
+
- lib/spider_rails/common/rspec.rb
|
153
|
+
- lib/spider_rails/common/dsl.rb
|
56
154
|
- lib/spider_rails.rb
|
57
155
|
- MIT-LICENSE
|
58
156
|
- Rakefile
|
@@ -1,91 +0,0 @@
|
|
1
|
-
module Spider
|
2
|
-
class GoogleDict < Common
|
3
|
-
def get_cards(keywords)
|
4
|
-
Headless.new.start
|
5
|
-
keywords.each do |keyword|
|
6
|
-
unless Card.find_by_word(keyword)
|
7
|
-
get_card keyword
|
8
|
-
save_record(Card, word: @card[:Word],
|
9
|
-
voice: @card[:Voice],
|
10
|
-
verb: @card[:Verb],
|
11
|
-
adj: @card[:Adjective],
|
12
|
-
noun: @card[:Noun],
|
13
|
-
pronoun: @card[:Pronoun],
|
14
|
-
synonyms: @card[:Synonyms],
|
15
|
-
abbr: @card[:Abbreviation],
|
16
|
-
prep: @card[:Preposition],
|
17
|
-
conj: @card[:Conjunction]
|
18
|
-
)
|
19
|
-
end
|
20
|
-
end
|
21
|
-
end
|
22
|
-
|
23
|
-
def get_keywords(path)
|
24
|
-
f = File.new(path)
|
25
|
-
dict = f.read.split(/\W/)
|
26
|
-
dict.delete("")
|
27
|
-
dict.uniq!
|
28
|
-
dict
|
29
|
-
end
|
30
|
-
|
31
|
-
class << self
|
32
|
-
def alias_methods(*args)
|
33
|
-
args.each do |arg|
|
34
|
-
alias_method arg, args.last
|
35
|
-
end
|
36
|
-
end
|
37
|
-
end
|
38
|
-
|
39
|
-
def get_card keyword
|
40
|
-
@card = Hash.new
|
41
|
-
@page = open_browser "https://www.google.com.hk/search?newwindow=1&safe=strict&q=#{keyword}+define&oq=#{keyword}+define"
|
42
|
-
|
43
|
-
doc = Nokogiri::HTML.parse @page.html
|
44
|
-
|
45
|
-
GoogleDict.alias_methods :card, :voice, :word, :get_content
|
46
|
-
card(doc, 'li.dct') do |c|
|
47
|
-
@card[:Word] = keyword.downcase
|
48
|
-
@card[:Voice] = voice(c, 'h3+.vk_sh')
|
49
|
-
|
50
|
-
# Get word explainations
|
51
|
-
get_explain(c)
|
52
|
-
end
|
53
|
-
|
54
|
-
@page.close
|
55
|
-
@card.delete(0)
|
56
|
-
@card
|
57
|
-
end
|
58
|
-
|
59
|
-
def get_explain(c)
|
60
|
-
type_nodes = c.css('div.vk_gy.vk_sh').to_a
|
61
|
-
content_nodes = c.css('div.vk_gy.vk_sh+div').to_a
|
62
|
-
type_nodes.each_with_index do |t, i|
|
63
|
-
table = content_nodes[i]
|
64
|
-
if table.css('li').count >= 2
|
65
|
-
fin_content = Array.new
|
66
|
-
table.css('li').each do |l|
|
67
|
-
fin_content << l.content
|
68
|
-
end
|
69
|
-
else
|
70
|
-
fin_content = table.content
|
71
|
-
end
|
72
|
-
@card[t.text.to_sym] = fin_content
|
73
|
-
end
|
74
|
-
end
|
75
|
-
|
76
|
-
#def login(username, password)
|
77
|
-
# element?('a.gbgt#gb_70') { |e| e.click }
|
78
|
-
# @b.text_field(name: 'Email').set username
|
79
|
-
# @b.text_field(name: 'Passwd').set password
|
80
|
-
# element?('input#signIn') { |e| e.click }
|
81
|
-
#end
|
82
|
-
|
83
|
-
def element?(selector, &block)
|
84
|
-
e = @page.element(css: selector)
|
85
|
-
if yield e
|
86
|
-
else
|
87
|
-
'element is nil'
|
88
|
-
end
|
89
|
-
end
|
90
|
-
end
|
91
|
-
end
|
data/lib/spider_rails/hl.rb
DELETED
@@ -1,13 +0,0 @@
|
|
1
|
-
module Lib
|
2
|
-
module Hl
|
3
|
-
class << self
|
4
|
-
def run
|
5
|
-
h = Headless.new
|
6
|
-
h.start
|
7
|
-
b = Watir::Browser.new :chrome, switches: %w[--proxy-server=socks5://127.0.0.1:7070]
|
8
|
-
b.goto 'https://www.google.com.hk/search?q=google+define&oq=google+define'
|
9
|
-
p b.title
|
10
|
-
end
|
11
|
-
end
|
12
|
-
end
|
13
|
-
end
|
@@ -1,10 +0,0 @@
|
|
1
|
-
module Spider
|
2
|
-
class SampleData < Common
|
3
|
-
def generate
|
4
|
-
100.times do |n|
|
5
|
-
Novel.create(title: "やめて#{n}", content: 'やめて やめてやめて やめてやめて やめてやめて やめてやめて やめてやめて やめてやめて やめてやめて やめてやめて やめてやめて やめてやめて やめてやめて やめてやめて やめてやめて やめてやめて やめてやめて やめてやめて やめてやめて やめてやめて やめて')
|
6
|
-
Card.create(word: "やめて#{n}", voice: 'やめて やめてやめて やめてやめて やめてやめて やめてやめて やめてやめて やめてやめて やめてやめて やめてやめて やめてやめて やめてやめて やめてやめて やめてやめて やめてやめて やめてやめて やめて')
|
7
|
-
end
|
8
|
-
end
|
9
|
-
end
|
10
|
-
end
|