husc 0.1.1 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/README.md +6 -9
- data/lib/husc/version.rb +2 -2
- data/lib/husc.rb +154 -161
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a8d3839225b1c0ae71db2371b3858620ac34408c0a209739dbe96e1683437a6f
|
4
|
+
data.tar.gz: 4effb7652a7d0121b835e4ac8f8255305c9eb078ee7acaa245dccd19c819843b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 29d4486ab7cf5de9fb3e757b0b495ac9fbcd88ed389d6696f04a42a8b4e6dfbb841e2435afb3af996d883bcde8ecdca9dde9911be20a7b36cf4009dea4dbdec8
|
7
|
+
data.tar.gz: be53729a52309e9b4dbc36cd82673e0d4f20ba8d9631390776a65c5fe623b6395b03636cffa498c4aed1df8cacd767d0ed6435a2f527cd393b68b4b7694c82f1
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -1,5 +1,5 @@
|
|
1
|
-
|
2
|
-
|
1
|
+
husc
|
2
|
+
====
|
3
3
|
|
4
4
|
A simple crawling utility for Ruby.
|
5
5
|
|
@@ -19,7 +19,7 @@ This project enables site crawling and data extraction with xpath and css select
|
|
19
19
|
require 'husc'
|
20
20
|
|
21
21
|
url = 'http://www.example.com/'
|
22
|
-
doc = Husc(url)
|
22
|
+
doc = Husc.new(url)
|
23
23
|
|
24
24
|
# access another url
|
25
25
|
doc.get('another url')
|
@@ -32,7 +32,6 @@ doc.html
|
|
32
32
|
|
33
33
|
# get <table> tags as dict
|
34
34
|
doc.tables
|
35
|
-
# ex) doc.tables['予約・お問い合わせ'] => 050-5596-6465
|
36
35
|
```
|
37
36
|
|
38
37
|
### Scraping Example
|
@@ -50,14 +49,14 @@ doc.xpath('//*[@id="top"]/div[1]')
|
|
50
49
|
|
51
50
|
# other example
|
52
51
|
doc.css('div').css('a')[2].attr('href') # => string object
|
53
|
-
doc.css('p').
|
52
|
+
doc.css('p').inner_text() # => string object
|
54
53
|
# You do not need to specify "[]" to access the first index
|
55
54
|
```
|
56
55
|
|
57
56
|
### Submitting Form Example
|
58
57
|
1. Specify target node's attribute
|
59
58
|
2. Specify value(int or str) / check(bool) / file_name(str)
|
60
|
-
3.
|
59
|
+
3. Call submit() with form attribute specified
|
61
60
|
```ruby
|
62
61
|
# login
|
63
62
|
doc.send(id:'id attribute', value:'value to send')
|
@@ -77,8 +76,6 @@ doc.send(class:'class attribute', value:100)
|
|
77
76
|
```
|
78
77
|
|
79
78
|
|
80
|
-
|
81
|
-
|
82
79
|
## Installation
|
83
80
|
```sh
|
84
81
|
$ gem install husc
|
@@ -86,4 +83,4 @@ $ gem install husc
|
|
86
83
|
|
87
84
|
|
88
85
|
## Contributing
|
89
|
-
Bug reports and pull requests are welcome on GitHub at [https://github.com/AjxLab/
|
86
|
+
Bug reports and pull requests are welcome on GitHub at [https://github.com/AjxLab/husc](https://github.com/AjxLab/husc).
|
data/lib/husc/version.rb
CHANGED
@@ -1,3 +1,3 @@
|
|
1
|
-
|
2
|
-
VERSION = "0.1.1"
|
1
|
+
class Husc
  # Gem version string. Frozen so the shared constant cannot be mutated in place.
  VERSION = "0.2.0".freeze
end
|
data/lib/husc.rb
CHANGED
@@ -5,211 +5,204 @@ require 'net/http'
|
|
5
5
|
require 'kconv'
|
6
6
|
require 'husc/version'
|
7
7
|
|
8
|
-
|
8
|
+
|
9
|
+
class Husc
|
9
10
|
class Error < StandardError; end
|
10
11
|
|
11
|
-
|
12
|
-
attr_reader :url, :html, :tables, :params
|
12
|
+
attr_reader :url, :html, :tables, :params
|
13
13
|
|
14
|
-
|
15
|
-
|
14
|
+
# Special array: a list of Husc nodes that forwards any call it does not
# understand itself to its first element.
class CrawlArray < Array
  # Returns the first element matching the single key/value pair in +search+.
  #
  # search - Hash with one entry. The key `inner` (Symbol or String) is
  #          compared against the element's inner_text; any other key is
  #          used as an attribute name and compared against element.attr(key).
  #
  # Returns the matching element, or an empty CrawlArray when nothing
  # matches, so that chained calls on the result stay safe.
  def find(search)
    ## -----*----- Search -----*----- ##
    key, expected = search.first
    key = key.to_s

    each do |element|
      # `inner` matches on the text content, everything else on an attribute.
      actual = key == 'inner' ? element.inner_text : element.attr(key)
      return element if actual == expected
    end

    # Falling through used to return `self` (the value of `each`), which
    # made a failed search indistinguishable from a successful one.
    CrawlArray.new
  end

  # Forwards unknown methods to the first element, or to a blank
  # Husc instance when the array is empty.
  #
  # Arguments and the block are passed through as objects (no eval), so
  # values whose #inspect is not valid Ruby source are handled correctly.
  def method_missing(method, *args, &block)
    target = empty? ? Husc.new(doc: nil) : first
    target.public_send(method, *args, &block)
  end

  # Keeps respond_to? consistent with method_missing without having to
  # build a throw-away Husc instance for the empty case.
  def respond_to_missing?(method, include_private = false)
    handled = empty? ? Husc.public_method_defined?(method) : first.respond_to?(method)
    handled || super
  end
end
|
38
|
+
|
39
|
+
# Builds a crawler from one of three sources, checked in this order:
# a URL to fetch, an already parsed node (doc:), or a raw HTML string (html:).
#
# url  - address handed to #get when it is not nil.
# doc  - parsed node used as-is; its #to_html becomes @html.
# html - markup handed to update_params when both url and doc are nil.
def initialize(url = nil, doc: nil, html: nil)
  ## -----*----- Constructor -----*----- ##
  @agent = Mechanize.new
  @agent.keep_alive = false

  if url.nil? && doc.nil?
    # Neither a URL nor a node: parse the given markup.
    update_params(html)
    @html = html
  elsif url.nil?
    # Wrap an existing node without re-parsing it.
    @html = doc.to_html
    @doc = doc
    table_to_hash
  else
    get(url)
  end

  # No form values are queued yet.
  @params = []
end
|
75
57
|
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
form = page.form(**opt)
|
84
|
-
end
|
85
|
-
return if form.nil?
|
86
|
-
|
87
|
-
@params.each do |param|
|
88
|
-
# テキスト,数値など
|
89
|
-
if param.include?(:value) && !param.include?(:check)
|
90
|
-
value = param.delete(:value)
|
91
|
-
next if value.nil?
|
92
|
-
form.field_with(**param).value = value unless form.field_with(**param).nil?
|
93
|
-
end
|
58
|
+
# Navigates to +url+ and rebuilds the crawler state from the response.
#
# url - address passed to the agent; also remembered in @url.
#
# Returns whatever update_params returns.
def get(url)
  ## -----*----- Page transition -----*----- ##
  @url = url
  body = @agent.get(@url).content.toutf8
  update_params(body)
end
|
94
65
|
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
end
|
66
|
+
# Queues one form value to be filled in by the next #submit.
#
# opts - Hash of locator attributes (e.g. id:, class:) plus one of:
#          value: for text, numbers and similar fields,
#          check: (Boolean) for checkboxes,
#          file:  (String) for file uploads.
#        Keys are stored as Symbols.
#
# NOTE: this method shadows Object#send on Husc instances; use __send__
# or public_send when dynamic dispatch is needed.
#
# Returns opts.
def send(opts)
  ## -----*----- Specify form data -----*----- ##
  entry = {}
  @params.push(entry)
  opts.each { |name, val| entry[name.to_sym] = val }
end
|
105
75
|
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
76
|
+
# Fills the chosen form with every value queued via #send and submits it.
#
# url - page that holds the form; defaults to the current @url.
# opt - Integer index into page.forms, or a Hash of criteria for page.form.
#
# The response body replaces the crawler state through update_params and
# the queue is emptied afterwards. When no form is found the method returns
# nil immediately and the queued values are left as they are.
def submit(url = @url, opt)
  ## -----*----- Form submission -----*----- ##
  @agent.get(url) do |page|
    # Pick the target form
    form = opt.kind_of?(Integer) ? page.forms[opt] : page.form(**opt)
    return if form.nil?

    @params.each do |entry|
      # Text, numbers, etc.
      if entry.include?(:value) && !entry.include?(:check)
        text = entry.delete(:value)
        next if text.nil?
        field = form.field_with(**entry)
        field.value = text unless field.nil?
      end

      # Checkbox
      if entry.include?(:check)
        wanted = entry.delete(:check)
        next if wanted.nil?
        box = form.checkbox_with(**entry)
        unless box.nil?
          wanted ? box.check : box.uncheck
        end
      end

      # File upload
      if entry.include?(:file)
        path = entry.delete(:file)
        next if path.nil? || !File.exist?(path)
        upload = form.file_upload_with(**entry)
        upload.file_name = path unless upload.nil?
      end
    end

    result = form.submit
    update_params(result.content.toutf8)
  end
  @params = []
end
|
135
119
|
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
else
|
144
|
-
return elements[0]
|
145
|
-
end
|
120
|
+
# Selects elements of the current document by XPath.
#
# locator - XPath expression handed to @doc.xpath.
# single  - when true, return only the first match instead of the list.
#
# Every matched node is wrapped in its own Husc instance. Returns a
# CrawlArray of them, or in single mode the first Husc (an empty
# CrawlArray when nothing matched).
def xpath(locator, single = false)
  ## -----*----- Get elements from HTML by XPath -----*----- ##
  wrapped = @doc.xpath(locator).map { |node| Husc.new(doc: node) }
  found = CrawlArray.new(wrapped)

  # Multiple nodes
  return found unless single

  # Single node
  found[0] == nil ? CrawlArray.new() : found[0]
end
|
151
135
|
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
136
|
+
# Selects elements of the current document by CSS selector.
#
# locator - CSS selector handed to @doc.css.
# single  - when true, return only the first match instead of the list.
#
# Every matched node is wrapped in its own Husc instance. Returns a
# CrawlArray of them, or in single mode the first Husc (an empty
# CrawlArray when nothing matched).
def css(locator, single = false)
  ## -----*----- Get elements from HTML by CSS selector -----*----- ##
  wrapped = @doc.css(locator).map { |node| Husc.new(doc: node) }
  found = CrawlArray.new(wrapped)

  # Multiple nodes
  return found unless single

  # Single node
  found[0] == nil ? CrawlArray.new() : found[0]
end
|
160
151
|
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
end
|
152
|
+
# Returns the string inside the current node (@doc.inner_text).
#
# shaping - when true (default) the string is passed through
#           shaping_string; when false it is returned untouched.
def inner_text(shaping = true)
  ## -----*----- Get the string inside the tag -----*----- ##
  raw = @doc.inner_text
  shaping ? shaping_string(raw) : raw
end
|
169
160
|
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
return ret
|
177
|
-
end
|
161
|
+
# Returns the text of the current node (@doc.text), i.e. the string inside
# the tag with other tags removed.
#
# shaping - when true (default) the string is passed through
#           shaping_string; when false it is returned untouched.
def text(shaping = true)
  ## -----*----- Get the string inside the tag (other tags removed) -----*----- ##
  raw = @doc.text
  shaping ? shaping_string(raw) : raw
end
|
179
169
|
|
180
|
-
|
181
|
-
|
182
|
-
|
170
|
+
# Looks up an attribute of the current node.
#
# name - attribute name handed to @doc.attr.
#
# Returns the attribute value, or an empty String when @doc.attr gives nil.
def attr(name)
  ## -----*----- Get attribute information of the node -----*----- ##
  value = @doc.attr(name)
  value.nil? ? '' : value
end
|
184
179
|
|
185
180
|
|
186
|
-
|
181
|
+
private
|
187
182
|
|
188
183
|
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
184
|
+
# Replaces the crawler state with the given markup: stores it in @html,
# parses it into @doc and rebuilds the table hash.
#
# html - markup handed to Nokogiri::HTML.parse.
#
# Returns whatever table_to_hash returns.
def update_params(html)
  ## -----*----- Update parameters -----*----- ##
  @html = html
  @doc = Nokogiri::HTML.parse(html)
  table_to_hash
end
|
195
190
|
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
end
|
202
|
-
@doc.css('dl').each do |el|
|
203
|
-
@tables[el.css('dt').inner_text.gsub("\n", "").gsub(" ", "")] = shaping_string(el.css('dd').inner_text)
|
204
|
-
end
|
191
|
+
# Rebuilds @tables from the current document.
#
# Each <tr> contributes the text of its <th> as key and the text of its
# <td> as value; each <dl> does the same with <dt> and <dd>. Keys have all
# newlines and spaces removed, values go through shaping_string. Later
# entries overwrite earlier ones with the same key.
def table_to_hash
  ## -----*----- Convert table contents to a Hash -----*----- ##
  @tables = {}

  @doc.css('tr').each do |row|
    heading = row.css('th').inner_text.delete("\n ")
    @tables[heading] = shaping_string(row.css('td').inner_text)
  end

  @doc.css('dl').each do |list|
    term = list.css('dt').inner_text.delete("\n ")
    @tables[term] = shaping_string(list.css('dd').inner_text)
  end
end
|
214
|
-
end
|
215
201
|
|
202
|
+
# Normalizes whitespace in a piece of scraped text.
#
# str - any object; converted with to_s (nil becomes "").
#
# Steps, in order: map no-break space (U+00A0) and ideographic space
# (U+3000) to an ASCII space, collapse runs of spaces, drop the leading
# space of a line and whitespace-only lines, turn "\r" into "\n", collapse
# runs of newlines, delete tabs, and strip both ends.
#
# Returns the cleaned String.
def shaping_string(str)
  ## -----*----- String shaping -----*----- ##
  # Remove all superfluous newlines and whitespace
  text = str.to_s
  # The first step used to replace an ASCII space with an ASCII space, a
  # no-op. Normalize the non-ASCII spaces common in scraped pages instead
  # so that squeeze/strip below can see them.
  text = text.tr("\u00A0\u3000", '  ')
  text.squeeze(' ').gsub("\n \n", "\n").gsub("\n ", "\n").gsub("\r", "\n").squeeze("\n").gsub("\t", "").strip
end
|
208
|
+
end
|