husc 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/README.md +6 -9
- data/lib/husc/version.rb +2 -2
- data/lib/husc.rb +154 -161
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a8d3839225b1c0ae71db2371b3858620ac34408c0a209739dbe96e1683437a6f
|
4
|
+
data.tar.gz: 4effb7652a7d0121b835e4ac8f8255305c9eb078ee7acaa245dccd19c819843b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 29d4486ab7cf5de9fb3e757b0b495ac9fbcd88ed389d6696f04a42a8b4e6dfbb841e2435afb3af996d883bcde8ecdca9dde9911be20a7b36cf4009dea4dbdec8
|
7
|
+
data.tar.gz: be53729a52309e9b4dbc36cd82673e0d4f20ba8d9631390776a65c5fe623b6395b03636cffa498c4aed1df8cacd767d0ed6435a2f527cd393b68b4b7694c82f1
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -1,5 +1,5 @@
|
|
1
|
-
|
2
|
-
|
1
|
+
husc
|
2
|
+
====
|
3
3
|
|
4
4
|
A simple crawling utility for Ruby.
|
5
5
|
|
@@ -19,7 +19,7 @@ This project enables site crawling and data extraction with xpath and css select
|
|
19
19
|
require 'husc'
|
20
20
|
|
21
21
|
url = 'http://www.example.com/'
|
22
|
-
doc = Husc(url)
|
22
|
+
doc = Husc.new(url)
|
23
23
|
|
24
24
|
# access another url
|
25
25
|
doc.get('another url')
|
@@ -32,7 +32,6 @@ doc.html
|
|
32
32
|
|
33
33
|
# get <table> tags as dict
|
34
34
|
doc.tables
|
35
|
-
# ex) doc.tables['予約・お問い合わせ'] => 050-5596-6465
|
36
35
|
```
|
37
36
|
|
38
37
|
### Scraping Example
|
@@ -50,14 +49,14 @@ doc.xpath('//*[@id="top"]/div[1]')
|
|
50
49
|
|
51
50
|
# other example
|
52
51
|
doc.css('div').css('a')[2].attr('href') # => string object
|
53
|
-
doc.css('p').
|
52
|
+
doc.css('p').inner_text() # => string object
|
54
53
|
# You do not need to specify "[]" to access the first index
|
55
54
|
```
|
56
55
|
|
57
56
|
### Submitting Form Example
|
58
57
|
1. Specify target node's attribute
|
59
58
|
2. Specify value(int or str) / check(bool) / file_name(str)
|
60
|
-
3.
|
59
|
+
3. Call submit() with form attribute specified
|
61
60
|
```ruby
|
62
61
|
# login
|
63
62
|
doc.send(id:'id attribute', value:'value to send')
|
@@ -77,8 +76,6 @@ doc.send(class:'class attribute', value:100)
|
|
77
76
|
```
|
78
77
|
|
79
78
|
|
80
|
-
|
81
|
-
|
82
79
|
## Installation
|
83
80
|
```sh
|
84
81
|
$ gem install husc
|
@@ -86,4 +83,4 @@ $ gem install husc
|
|
86
83
|
|
87
84
|
|
88
85
|
## Contributing
|
89
|
-
Bug reports and pull requests are welcome on GitHub at [https://github.com/AjxLab/
|
86
|
+
Bug reports and pull requests are welcome on GitHub at [https://github.com/AjxLab/husc](https://github.com/AjxLab/husc).
|
data/lib/husc/version.rb
CHANGED
@@ -1,3 +1,3 @@
|
|
1
|
-
|
2
|
-
VERSION = "0.1.1"
|
1
|
+
class Husc
|
2
|
+
VERSION = "0.2.0"
|
3
3
|
end
|
data/lib/husc.rb
CHANGED
@@ -5,211 +5,204 @@ require 'net/http'
|
|
5
5
|
require 'kconv'
|
6
6
|
require 'husc/version'
|
7
7
|
|
8
|
-
|
8
|
+
|
9
|
+
class Husc
|
9
10
|
class Error < StandardError; end
|
10
11
|
|
11
|
-
|
12
|
-
attr_reader :url, :html, :tables, :params
|
12
|
+
attr_reader :url, :html, :tables, :params
|
13
13
|
|
14
|
-
|
15
|
-
|
14
|
+
# 特殊配列
|
15
|
+
class CrawlArray < Array
|
16
16
|
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
end
|
27
|
-
end
|
28
|
-
end
|
29
|
-
|
30
|
-
def method_missing(method, *args)
|
31
|
-
if self == []
|
32
|
-
return eval("Crawler.new(doc: nil).#{method}(*#{args})")
|
17
|
+
def find(search)
|
18
|
+
## -----*----- 検索 -----*----- ##
|
19
|
+
self.each do |e|
|
20
|
+
if search.keys[0].to_s == 'inner'
|
21
|
+
# inner_textが一致するか
|
22
|
+
return e if e.inner_text == search.values[0]
|
23
|
+
else
|
24
|
+
# 属性が一致するか
|
25
|
+
return e if e.attr(search.keys[0].to_s) == search.values[0]
|
33
26
|
end
|
34
|
-
|
35
|
-
return eval("self[0].#{method}(*#{args})")
|
36
27
|
end
|
37
28
|
end
|
38
29
|
|
39
|
-
def
|
40
|
-
|
41
|
-
|
42
|
-
@agent.keep_alive = false
|
43
|
-
|
44
|
-
if !url.nil?
|
45
|
-
get(url)
|
46
|
-
elsif !doc.nil?
|
47
|
-
@html = doc.to_html
|
48
|
-
@doc = doc
|
49
|
-
table_to_hash
|
50
|
-
else
|
51
|
-
update_params(html)
|
52
|
-
@html = html
|
30
|
+
def method_missing(method, *args)
|
31
|
+
if self == []
|
32
|
+
return eval("Husc.new(doc: nil).#{method}(*#{args})")
|
53
33
|
end
|
54
34
|
|
55
|
-
|
35
|
+
return eval("self[0].#{method}(*#{args})")
|
56
36
|
end
|
37
|
+
end
|
38
|
+
|
39
|
+
def initialize(url = nil, doc: nil, html: nil)
|
40
|
+
## -----*----- コンストラクタ -----*----- ##
|
41
|
+
@agent = Mechanize.new
|
42
|
+
@agent.keep_alive = false
|
57
43
|
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
44
|
+
if !url.nil?
|
45
|
+
get(url)
|
46
|
+
elsif !doc.nil?
|
47
|
+
@html = doc.to_html
|
48
|
+
@doc = doc
|
49
|
+
table_to_hash
|
50
|
+
else
|
63
51
|
update_params(html)
|
52
|
+
@html = html
|
64
53
|
end
|
65
54
|
|
66
|
-
|
67
|
-
|
68
|
-
#
|
69
|
-
# テキスト,数値など => value(String)を指定
|
70
|
-
# チェックボックス => check(Bool)を指定
|
71
|
-
# ファイルアップロード => file(String)を指定
|
72
|
-
@params << {}
|
73
|
-
opts.each {|key, value| @params[-1][key.to_sym] = value}
|
74
|
-
end
|
55
|
+
@params = []
|
56
|
+
end
|
75
57
|
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
form = page.form(**opt)
|
84
|
-
end
|
85
|
-
return if form.nil?
|
86
|
-
|
87
|
-
@params.each do |param|
|
88
|
-
# テキスト,数値など
|
89
|
-
if param.include?(:value) && !param.include?(:check)
|
90
|
-
value = param.delete(:value)
|
91
|
-
next if value.nil?
|
92
|
-
form.field_with(**param).value = value unless form.field_with(**param).nil?
|
93
|
-
end
|
58
|
+
def get(url)
|
59
|
+
## -----*----- ページ推移 -----*----- ##
|
60
|
+
@url = url
|
61
|
+
page = @agent.get(@url)
|
62
|
+
html = page.content.toutf8
|
63
|
+
update_params(html)
|
64
|
+
end
|
94
65
|
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
end
|
66
|
+
def send(opts)
|
67
|
+
## -----*----- フォームデータ指定 -----*----- ##
|
68
|
+
#
|
69
|
+
# テキスト,数値など => value(String)を指定
|
70
|
+
# チェックボックス => check(Bool)を指定
|
71
|
+
# ファイルアップロード => file(String)を指定
|
72
|
+
@params << {}
|
73
|
+
opts.each {|key, value| @params[-1][key.to_sym] = value}
|
74
|
+
end
|
105
75
|
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
76
|
+
def submit(url = @url, opt)
|
77
|
+
## -----*----- フォーム送信 -----*----- ##
|
78
|
+
@agent.get(url) do |page|
|
79
|
+
# フォーム指定
|
80
|
+
if opt.kind_of?(Integer)
|
81
|
+
form = page.forms[opt]
|
82
|
+
else
|
83
|
+
form = page.form(**opt)
|
84
|
+
end
|
85
|
+
return if form.nil?
|
86
|
+
|
87
|
+
@params.each do |param|
|
88
|
+
# テキスト,数値など
|
89
|
+
if param.include?(:value) && !param.include?(:check)
|
90
|
+
value = param.delete(:value)
|
91
|
+
next if value.nil?
|
92
|
+
form.field_with(**param).value = value unless form.field_with(**param).nil?
|
112
93
|
end
|
113
94
|
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
95
|
+
# チェックボックス
|
96
|
+
if param.include?(:check)
|
97
|
+
check = param.delete(:check)
|
98
|
+
next if check.nil?
|
99
|
+
if check
|
100
|
+
form.checkbox_with(**param).check unless form.checkbox_with(**param).nil?
|
101
|
+
else
|
102
|
+
form.checkbox_with(**param).uncheck unless form.checkbox_with(**param).nil?
|
103
|
+
end
|
104
|
+
end
|
119
105
|
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
if elements[0] == nil
|
126
|
-
return CrawlArray.new()
|
127
|
-
else
|
128
|
-
return elements[0]
|
106
|
+
# ファイルアップロード
|
107
|
+
if param.include?(:file)
|
108
|
+
file = param.delete(:file)
|
109
|
+
next if file.nil? || !File.exist?(file)
|
110
|
+
form.file_upload_with(**param).file_name = file unless form.file_upload_with(**param).nil?
|
129
111
|
end
|
130
|
-
else
|
131
|
-
# 複数ノード
|
132
|
-
return elements
|
133
112
|
end
|
113
|
+
|
114
|
+
form = form.submit
|
115
|
+
update_params(form.content.toutf8)
|
134
116
|
end
|
117
|
+
@params = []
|
118
|
+
end
|
135
119
|
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
else
|
144
|
-
return elements[0]
|
145
|
-
end
|
120
|
+
def xpath(locator, single = false)
|
121
|
+
## -----*----- HTMLからXPath指定で要素取得 -----*----- ##
|
122
|
+
elements = CrawlArray.new(@doc.xpath(locator).map {|el| Husc.new(doc: el)})
|
123
|
+
if single
|
124
|
+
# シングルノード
|
125
|
+
if elements[0] == nil
|
126
|
+
return CrawlArray.new()
|
146
127
|
else
|
147
|
-
|
148
|
-
return elements
|
128
|
+
return elements[0]
|
149
129
|
end
|
130
|
+
else
|
131
|
+
# 複数ノード
|
132
|
+
return elements
|
150
133
|
end
|
134
|
+
end
|
151
135
|
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
136
|
+
def css(locator, single = false)
|
137
|
+
## -----*----- HTMLからCSSセレクタで要素取得 -----*----- ##
|
138
|
+
elements = CrawlArray.new(@doc.css(locator).map {|el| Husc.new(doc: el)})
|
139
|
+
if single
|
140
|
+
# シングルノード
|
141
|
+
if elements[0] == nil
|
142
|
+
return CrawlArray.new()
|
156
143
|
else
|
157
|
-
|
144
|
+
return elements[0]
|
158
145
|
end
|
146
|
+
else
|
147
|
+
# 複数ノード
|
148
|
+
return elements
|
159
149
|
end
|
150
|
+
end
|
160
151
|
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
end
|
152
|
+
def inner_text(shaping = true)
|
153
|
+
## -----*----- タグ内の文字列を取得 -----*----- ##
|
154
|
+
if shaping
|
155
|
+
return shaping_string(@doc.inner_text)
|
156
|
+
else
|
157
|
+
@doc.inner_text
|
168
158
|
end
|
159
|
+
end
|
169
160
|
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
return ret
|
177
|
-
end
|
161
|
+
def text(shaping = true)
|
162
|
+
## -----*----- タグ内の文字列(その他タグ除去)を取得 -----*----- ##
|
163
|
+
if shaping
|
164
|
+
return shaping_string(@doc.text)
|
165
|
+
else
|
166
|
+
@doc.text
|
178
167
|
end
|
168
|
+
end
|
179
169
|
|
180
|
-
|
181
|
-
|
182
|
-
|
170
|
+
def attr(name)
|
171
|
+
## -----*----- ノードの属性情報取得 -----*----- ##
|
172
|
+
ret = @doc.attr(name)
|
173
|
+
if ret.nil?
|
174
|
+
return ''
|
175
|
+
else
|
176
|
+
return ret
|
183
177
|
end
|
178
|
+
end
|
184
179
|
|
185
180
|
|
186
|
-
|
181
|
+
private
|
187
182
|
|
188
183
|
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
184
|
+
def update_params(html)
|
185
|
+
## -----*----- パラメータを更新 -----*----- ##
|
186
|
+
@html = html
|
187
|
+
@doc = Nokogiri::HTML.parse(@html)
|
188
|
+
table_to_hash
|
189
|
+
end
|
195
190
|
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
end
|
202
|
-
@doc.css('dl').each do |el|
|
203
|
-
@tables[el.css('dt').inner_text.gsub("\n", "").gsub(" ", "")] = shaping_string(el.css('dd').inner_text)
|
204
|
-
end
|
191
|
+
def table_to_hash
|
192
|
+
## -----*----- テーブル内容をHashに変換 -----*----- ##
|
193
|
+
@tables = {}
|
194
|
+
@doc.css('tr').each do |tr|
|
195
|
+
@tables[tr.css('th').inner_text.gsub("\n", "").gsub(" ", "")] = shaping_string(tr.css('td').inner_text)
|
205
196
|
end
|
206
|
-
|
207
|
-
|
208
|
-
## -----*----- 文字例の整形 -----*----- ##
|
209
|
-
# 余計な改行,空白を全て削除
|
210
|
-
str = str.to_s
|
211
|
-
return str.gsub(" ", ' ').squeeze(' ').gsub("\n \n", "\n").gsub("\n ", "\n").gsub("\r", "\n").squeeze("\n").gsub("\t", "").strip
|
197
|
+
@doc.css('dl').each do |el|
|
198
|
+
@tables[el.css('dt').inner_text.gsub("\n", "").gsub(" ", "")] = shaping_string(el.css('dd').inner_text)
|
212
199
|
end
|
213
200
|
end
|
214
|
-
end
|
215
201
|
|
202
|
+
def shaping_string(str)
|
203
|
+
## -----*----- 文字例の整形 -----*----- ##
|
204
|
+
# 余計な改行,空白を全て削除
|
205
|
+
str = str.to_s
|
206
|
+
return str.gsub(" ", ' ').squeeze(' ').gsub("\n \n", "\n").gsub("\n ", "\n").gsub("\r", "\n").squeeze("\n").gsub("\t", "").strip
|
207
|
+
end
|
208
|
+
end
|