husc 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 70da435ce2b15bb485ce958997a91488cf19f00b7faf5da3f25c92a891028508
4
- data.tar.gz: b363fa06b547c1a5612889739465af7ea46cbeea6b6922aab0d484fa21169ce6
3
+ metadata.gz: a8d3839225b1c0ae71db2371b3858620ac34408c0a209739dbe96e1683437a6f
4
+ data.tar.gz: 4effb7652a7d0121b835e4ac8f8255305c9eb078ee7acaa245dccd19c819843b
5
5
  SHA512:
6
- metadata.gz: 2190d0269954730626eca1d142c8ffd00614bf98fb19d7ba4d595668e944d769636f26c151211a664ba775b2d9f1c3b5594bbd7d5cde05eec7af31e11f82dfff
7
- data.tar.gz: 5f0ea49baf5d2fbf6a707f3b580e95364686c2a3e1439d8c3469c090335790ebeef062176e3d67e3afd7592b23e2256f92f6421a14e4b208c94b4e5a1e737a7f
6
+ metadata.gz: 29d4486ab7cf5de9fb3e757b0b495ac9fbcd88ed389d6696f04a42a8b4e6dfbb841e2435afb3af996d883bcde8ecdca9dde9911be20a7b36cf4009dea4dbdec8
7
+ data.tar.gz: be53729a52309e9b4dbc36cd82673e0d4f20ba8d9631390776a65c5fe623b6395b03636cffa498c4aed1df8cacd767d0ed6435a2f527cd393b68b4b7694c82f1
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- husc (0.1.0)
4
+ husc (0.1.1)
5
5
  mechanize
6
6
  nokogiri
7
7
 
data/README.md CHANGED
@@ -1,5 +1,5 @@
1
- Husc
2
- =======
1
+ husc
2
+ ====
3
3
 
4
4
  A simple crawling utility for Ruby.
5
5
 
@@ -19,7 +19,7 @@ This project enables site crawling and data extraction with xpath and css select
19
19
  require 'husc'
20
20
 
21
21
  url = 'http://www.example.com/'
22
- doc = Husc(url)
22
+ doc = Husc.new(url)
23
23
 
24
24
  # access another url
25
25
  doc.get('another url')
@@ -32,7 +32,6 @@ doc.html
32
32
 
33
33
  # get <table> rows (and <dl> lists) as a Hash of header text => cell text
34
34
  doc.tables
35
- # ex) doc.tables['予約・お問い合わせ'] => 050-5596-6465
36
35
  ```
37
36
 
38
37
  ### Scraping Example
@@ -50,14 +49,14 @@ doc.xpath('//*[@id="top"]/div[1]')
50
49
 
51
50
  # other example
52
51
  doc.css('div').css('a')[2].attr('href') # => string object
53
- doc.css('p').innerText() # => string object
52
+ doc.css('p').inner_text() # => string object
54
53
  # You do not need to specify "[]" to access the first index
55
54
  ```
56
55
 
57
56
  ### Submitting Form Example
58
57
  1. Specify target node's attribute
59
58
  2. Specify value(int or str) / check(bool) / file(str: path to an existing file)
60
- 3. call submit() with form attribute specified
59
+ 3. Call submit() with form attribute specified
61
60
  ```ruby
62
61
  # login
63
62
  doc.send(id:'id attribute', value:'value to send')
@@ -77,8 +76,6 @@ doc.send(class:'class attribute', value:100)
77
76
  ```
78
77
 
79
78
 
80
-
81
-
82
79
  ## Installation
83
80
  ```sh
84
81
  $ gem install husc
@@ -86,4 +83,4 @@ $ gem install husc
86
83
 
87
84
 
88
85
  ## Contributing
89
- Bug reports and pull requests are welcome on GitHub at [https://github.com/AjxLab/PyCrawl](https://github.com/AjxLab/PyCrawl).
86
+ Bug reports and pull requests are welcome on GitHub at [https://github.com/AjxLab/husc](https://github.com/AjxLab/husc).
data/lib/husc/version.rb CHANGED
@@ -1,3 +1,3 @@
1
- module Husc
2
- VERSION = "0.1.1"
1
+ class Husc
2
+ VERSION = "0.2.0"
3
3
  end
data/lib/husc.rb CHANGED
@@ -5,211 +5,204 @@ require 'net/http'
5
5
  require 'kconv'
6
6
  require 'husc/version'
7
7
 
8
- module Husc
8
+
9
+ class Husc
9
10
  class Error < StandardError; end
10
11
 
11
- class Crawler
12
- attr_reader :url, :html, :tables, :params
12
+ attr_reader :url, :html, :tables, :params
13
13
 
14
- # 特殊配列
15
- class CrawlArray < Array
14
+ # 特殊配列
15
+ class CrawlArray < Array
16
16
 
17
- def find(search)
18
- ## -----*----- 検索 -----*----- ##
19
- self.each do |e|
20
- if search.keys[0].to_s == 'inner'
21
- # innerTextが一致するか
22
- return e if e.innerText == search.values[0]
23
- else
24
- # 属性が一致するか
25
- return e if e.attr(search.keys[0].to_s) == search.values[0]
26
- end
27
- end
28
- end
29
-
30
- def method_missing(method, *args)
31
- if self == []
32
- return eval("Crawler.new(doc: nil).#{method}(*#{args})")
17
+ def find(search)
18
+ ## -----*----- 検索 -----*----- ##
19
+ self.each do |e|
20
+ if search.keys[0].to_s == 'inner'
21
+ # inner_textが一致するか
22
+ return e if e.inner_text == search.values[0]
23
+ else
24
+ # 属性が一致するか
25
+ return e if e.attr(search.keys[0].to_s) == search.values[0]
33
26
  end
34
-
35
- return eval("self[0].#{method}(*#{args})")
36
27
  end
37
28
  end
38
29
 
39
- def initialize(url = nil, doc: nil, html: nil)
40
- ## -----*----- コンストラクタ -----*----- ##
41
- @agent = Mechanize.new
42
- @agent.keep_alive = false
43
-
44
- if !url.nil?
45
- get(url)
46
- elsif !doc.nil?
47
- @html = doc.to_html
48
- @doc = doc
49
- table_to_hash
50
- else
51
- update_params(html)
52
- @html = html
30
+ def method_missing(method, *args)
31
+ if self == []
32
+ return eval("Husc.new(doc: nil).#{method}(*#{args})")
53
33
  end
54
34
 
55
- @params = []
35
+ return eval("self[0].#{method}(*#{args})")
56
36
  end
37
+ end
38
+
39
+ def initialize(url = nil, doc: nil, html: nil)
40
+ ## -----*----- コンストラクタ -----*----- ##
41
+ @agent = Mechanize.new
42
+ @agent.keep_alive = false
57
43
 
58
- def get(url)
59
- ## -----*----- ページ推移 -----*----- ##
60
- @url = url
61
- page = @agent.get(@url)
62
- html = page.content.toutf8
44
+ if !url.nil?
45
+ get(url)
46
+ elsif !doc.nil?
47
+ @html = doc.to_html
48
+ @doc = doc
49
+ table_to_hash
50
+ else
63
51
  update_params(html)
52
+ @html = html
64
53
  end
65
54
 
66
- def send(opts)
67
- ## -----*----- フォームデータ指定 -----*----- ##
68
- #
69
- # テキスト,数値など  => value(String)を指定
70
- # チェックボックス   => check(Bool)を指定
71
- # ファイルアップロード => file(String)を指定
72
- @params << {}
73
- opts.each {|key, value| @params[-1][key.to_sym] = value}
74
- end
55
+ @params = []
56
+ end
75
57
 
76
- def submit(url = @url, opt)
77
- ## -----*----- フォーム送信 -----*----- ##
78
- @agent.get(url) do |page|
79
- # フォーム指定
80
- if opt.kind_of?(Integer)
81
- form = page.forms[opt]
82
- else
83
- form = page.form(**opt)
84
- end
85
- return if form.nil?
86
-
87
- @params.each do |param|
88
- # テキスト,数値など
89
- if param.include?(:value) && !param.include?(:check)
90
- value = param.delete(:value)
91
- next if value.nil?
92
- form.field_with(**param).value = value unless form.field_with(**param).nil?
93
- end
58
+ def get(url)
59
+ ## -----*----- ページ推移 -----*----- ##
60
+ @url = url
61
+ page = @agent.get(@url)
62
+ html = page.content.toutf8
63
+ update_params(html)
64
+ end
94
65
 
95
- # チェックボックス
96
- if param.include?(:check)
97
- check = param.delete(:check)
98
- next if check.nil?
99
- if check
100
- form.checkbox_with(**param).check unless form.checkbox_with(**param).nil?
101
- else
102
- form.checkbox_with(**param).uncheck unless form.checkbox_with(**param).nil?
103
- end
104
- end
66
+ def send(opts)
67
+ ## -----*----- フォームデータ指定 -----*----- ##
68
+ #
69
+ # テキスト,数値など  => value(String)を指定
70
+ # チェックボックス   => check(Bool)を指定
71
+ # ファイルアップロード => file(String)を指定
72
+ @params << {}
73
+ opts.each {|key, value| @params[-1][key.to_sym] = value}
74
+ end
105
75
 
106
- # ファイルアップロード
107
- if param.include?(:file)
108
- file = param.delete(:file)
109
- next if file.nil? || !File.exist?(file)
110
- form.file_upload_with(**param).file_name = file unless form.file_upload_with(**param).nil?
111
- end
76
+ def submit(url = @url, opt)
77
+ ## -----*----- フォーム送信 -----*----- ##
78
+ @agent.get(url) do |page|
79
+ # フォーム指定
80
+ if opt.kind_of?(Integer)
81
+ form = page.forms[opt]
82
+ else
83
+ form = page.form(**opt)
84
+ end
85
+ return if form.nil?
86
+
87
+ @params.each do |param|
88
+ # テキスト,数値など
89
+ if param.include?(:value) && !param.include?(:check)
90
+ value = param.delete(:value)
91
+ next if value.nil?
92
+ form.field_with(**param).value = value unless form.field_with(**param).nil?
112
93
  end
113
94
 
114
- form = form.submit
115
- update_params(form.content.toutf8)
116
- end
117
- @params = []
118
- end
95
+ # チェックボックス
96
+ if param.include?(:check)
97
+ check = param.delete(:check)
98
+ next if check.nil?
99
+ if check
100
+ form.checkbox_with(**param).check unless form.checkbox_with(**param).nil?
101
+ else
102
+ form.checkbox_with(**param).uncheck unless form.checkbox_with(**param).nil?
103
+ end
104
+ end
119
105
 
120
- def xpath(locator, single = false)
121
- ## -----*----- HTMLからXPath指定で要素取得 -----*----- ##
122
- elements = CrawlArray.new(@doc.xpath(locator).map {|el| Crawler.new(doc: el)})
123
- if single
124
- # シングルノード
125
- if elements[0] == nil
126
- return CrawlArray.new()
127
- else
128
- return elements[0]
106
+ # ファイルアップロード
107
+ if param.include?(:file)
108
+ file = param.delete(:file)
109
+ next if file.nil? || !File.exist?(file)
110
+ form.file_upload_with(**param).file_name = file unless form.file_upload_with(**param).nil?
129
111
  end
130
- else
131
- # 複数ノード
132
- return elements
133
112
  end
113
+
114
+ form = form.submit
115
+ update_params(form.content.toutf8)
134
116
  end
117
+ @params = []
118
+ end
135
119
 
136
- def css(locator, single = false)
137
- ## -----*----- HTMLからCSSセレクタで要素取得 -----*----- ##
138
- elements = CrawlArray.new(@doc.css(locator).map {|el| Crawler.new(doc: el)})
139
- if single
140
- # シングルノード
141
- if elements[0] == nil
142
- return CrawlArray.new()
143
- else
144
- return elements[0]
145
- end
120
+ def xpath(locator, single = false)
121
+ ## -----*----- HTMLからXPath指定で要素取得 -----*----- ##
122
+ elements = CrawlArray.new(@doc.xpath(locator).map {|el| Husc.new(doc: el)})
123
+ if single
124
+ # シングルノード
125
+ if elements[0] == nil
126
+ return CrawlArray.new()
146
127
  else
147
- # 複数ノード
148
- return elements
128
+ return elements[0]
149
129
  end
130
+ else
131
+ # 複数ノード
132
+ return elements
150
133
  end
134
+ end
151
135
 
152
- def innerText(shaping = true)
153
- ## -----*----- タグ内の文字列を取得 -----*----- ##
154
- if shaping
155
- return shaping_string(@doc.inner_text)
136
+ def css(locator, single = false)
137
+ ## -----*----- HTMLからCSSセレクタで要素取得 -----*----- ##
138
+ elements = CrawlArray.new(@doc.css(locator).map {|el| Husc.new(doc: el)})
139
+ if single
140
+ # シングルノード
141
+ if elements[0] == nil
142
+ return CrawlArray.new()
156
143
  else
157
- @doc.inner_text
144
+ return elements[0]
158
145
  end
146
+ else
147
+ # 複数ノード
148
+ return elements
159
149
  end
150
+ end
160
151
 
161
- def text(shaping = true)
162
- ## -----*----- タグ内の文字列(その他タグ除去)を取得 -----*----- ##
163
- if shaping
164
- return shaping_string(@doc.text)
165
- else
166
- @doc.text
167
- end
152
+ def inner_text(shaping = true)
153
+ ## -----*----- タグ内の文字列を取得 -----*----- ##
154
+ if shaping
155
+ return shaping_string(@doc.inner_text)
156
+ else
157
+ @doc.inner_text
168
158
  end
159
+ end
169
160
 
170
- def attr(name)
171
- ## -----*----- ノードの属性情報取得 -----*----- ##
172
- ret = @doc.attr(name)
173
- if ret.nil?
174
- return ''
175
- else
176
- return ret
177
- end
161
+ def text(shaping = true)
162
+ ## -----*----- タグ内の文字列(その他タグ除去)を取得 -----*----- ##
163
+ if shaping
164
+ return shaping_string(@doc.text)
165
+ else
166
+ @doc.text
178
167
  end
168
+ end
179
169
 
180
- def url()
181
- ## -----*----- カレントURLの取得 -----*----- ##
182
- return @agent.get(@url).uri.to_s
170
+ def attr(name)
171
+ ## -----*----- ノードの属性情報取得 -----*----- ##
172
+ ret = @doc.attr(name)
173
+ if ret.nil?
174
+ return ''
175
+ else
176
+ return ret
183
177
  end
178
+ end
184
179
 
185
180
 
186
- private
181
+ private
187
182
 
188
183
 
189
- def update_params(html)
190
- ## -----*----- パラメータを更新 -----*----- ##
191
- @html = html
192
- @doc = Nokogiri::HTML.parse(@html)
193
- table_to_hash
194
- end
184
+ def update_params(html)
185
+ ## -----*----- パラメータを更新 -----*----- ##
186
+ @html = html
187
+ @doc = Nokogiri::HTML.parse(@html)
188
+ table_to_hash
189
+ end
195
190
 
196
- def table_to_hash
197
- ## -----*----- テーブル内容をHashに変換 -----*----- ##
198
- @tables = {}
199
- @doc.css('tr').each do |tr|
200
- @tables[tr.css('th').inner_text.gsub("\n", "").gsub(" ", "")] = shaping_string(tr.css('td').inner_text)
201
- end
202
- @doc.css('dl').each do |el|
203
- @tables[el.css('dt').inner_text.gsub("\n", "").gsub(" ", "")] = shaping_string(el.css('dd').inner_text)
204
- end
191
+ def table_to_hash
192
+ ## -----*----- テーブル内容をHashに変換 -----*----- ##
193
+ @tables = {}
194
+ @doc.css('tr').each do |tr|
195
+ @tables[tr.css('th').inner_text.gsub("\n", "").gsub(" ", "")] = shaping_string(tr.css('td').inner_text)
205
196
  end
206
-
207
- def shaping_string(str)
208
- ## -----*----- 文字例の整形 -----*----- ##
209
- # 余計な改行,空白を全て削除
210
- str = str.to_s
211
- return str.gsub(" ", ' ').squeeze(' ').gsub("\n \n", "\n").gsub("\n ", "\n").gsub("\r", "\n").squeeze("\n").gsub("\t", "").strip
197
+ @doc.css('dl').each do |el|
198
+ @tables[el.css('dt').inner_text.gsub("\n", "").gsub(" ", "")] = shaping_string(el.css('dd').inner_text)
212
199
  end
213
200
  end
214
- end
215
201
 
202
+ def shaping_string(str)
203
+ ## -----*----- 文字例の整形 -----*----- ##
204
+ # 余計な改行,空白を全て削除
205
+ str = str.to_s
206
+ return str.gsub(" ", ' ').squeeze(' ').gsub("\n \n", "\n").gsub("\n ", "\n").gsub("\r", "\n").squeeze("\n").gsub("\t", "").strip
207
+ end
208
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: husc
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Tatsuya Abe