hertools 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/hertools/version.rb +1 -1
- data/lib/hertools/website_parser.rb +24 -19
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: '05092964905a17f8eeefae0d133e2ea5ba85db714759ccc42d173b4ec8a5dbbb'
|
4
|
+
data.tar.gz: 1f0a3befe399d98d0c0cccb543a92e6b1674675e898a7c4a437ae7b91e9998c6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 43a337be463c80fd54caba8f05feb63f638f2208a671c45827ce331114f1e40209a68d353feebb302115f91721201bd818b783d628a77b7c1d208725922a4bcc
|
7
|
+
data.tar.gz: 7d0824dc195b48c560009d1a1dba29fc2a479b2cf6a9d8327d769005109696fce926f8184370d6c32ced34adaeb24fdc20051ef84fc63bcdf18d3b9df006f035
|
data/lib/hertools/website_parser.rb
CHANGED
@@ -2,6 +2,7 @@
|
|
2
2
|
|
3
3
|
require 'base64'
|
4
4
|
require 'open-uri'
|
5
|
+
require 'net/http'
|
5
6
|
require 'httparty'
|
6
7
|
require 'htmlentities'
|
7
8
|
require 'nokogiri'
|
@@ -13,7 +14,7 @@ module Hertools
|
|
13
14
|
# Arguments
|
14
15
|
# url: the url of a webpage
|
15
16
|
# options:
|
16
|
-
# html_parser: %w[httparty nokogiri]
|
17
|
+
# html_parser: %w[httparty nokogiri net_http]
|
17
18
|
# root_path: existing file directory
|
18
19
|
def crawl_title_and_favicon_file(url, options = {})
|
19
20
|
puts '>>> Parsing the arguments <<<'
|
@@ -27,10 +28,8 @@ module Hertools
|
|
27
28
|
end
|
28
29
|
|
29
30
|
puts '>>> Analysing the http response <<<'
|
30
|
-
|
31
|
-
|
32
|
-
res = response.body
|
33
|
-
else
|
31
|
+
case @html_parser
|
32
|
+
when 'nokogiri'
|
34
33
|
response = HTTParty.head(@url)
|
35
34
|
res = begin
|
36
35
|
Nokogiri::HTML(URI.open(url), nil, 'UTF-8')
|
@@ -38,6 +37,12 @@ module Hertools
|
|
38
37
|
puts e
|
39
38
|
nil
|
40
39
|
end
|
40
|
+
when 'httparty'
|
41
|
+
response = HTTParty.get(@url)
|
42
|
+
res = response.body
|
43
|
+
else
|
44
|
+
response = Net::HTTP.get_response(URI(@url))
|
45
|
+
res = response.body.force_encoding("utf-8")
|
41
46
|
end
|
42
47
|
puts "HttpCode: #{response.code}"
|
43
48
|
|
@@ -47,27 +52,27 @@ module Hertools
|
|
47
52
|
@favicon_url = "#{@index_url}/favicon.ico"
|
48
53
|
puts "Use the default favicon url: #{@favicon_url}."
|
49
54
|
else
|
50
|
-
@title = if
|
51
|
-
res[%r{<title>\n*(.*)\n*</title>}, 1].to_s
|
52
|
-
else
|
55
|
+
@title = if nokogiri?
|
53
56
|
res.xpath('//head/title')[0]&.content.to_s
|
57
|
+
else
|
58
|
+
res[%r{<title>\n*(.*)\n*</title>}, 1].to_s
|
54
59
|
end
|
55
60
|
if @title.empty?
|
56
61
|
puts 'Not found the title!'
|
57
62
|
puts 'Use the domain name as the title.'
|
58
63
|
@title = @domain_name
|
59
64
|
end
|
60
|
-
|
65
|
+
unless nokogiri?
|
61
66
|
coder = HTMLEntities.new
|
62
67
|
@title = coder.decode(@title)
|
63
68
|
end
|
64
69
|
puts "Title: #{@title}"
|
65
70
|
|
66
|
-
@favicon_url = if
|
67
|
-
res[/<link rel="icon".*href="([^"]+)/, 1].to_s
|
68
|
-
else
|
71
|
+
@favicon_url = if nokogiri?
|
69
72
|
favicon_links = res.xpath('//head/link[@rel="icon"]')
|
70
73
|
favicon_links.empty? ? '' : favicon_links[0][:href].to_s
|
74
|
+
else
|
75
|
+
res[/<link rel="icon".*href="([^"]+)/, 1].to_s
|
71
76
|
end
|
72
77
|
if @favicon_url.empty?
|
73
78
|
puts 'Not found the favicon url!'
|
@@ -147,10 +152,10 @@ module Hertools
|
|
147
152
|
|
148
153
|
def parse_options(options)
|
149
154
|
options = Hash(options)
|
150
|
-
html_parser = options.fetch(:html_parser) { '
|
155
|
+
html_parser = options.fetch(:html_parser) { 'net_http' }
|
151
156
|
@old_html_parser = @html_parser
|
152
|
-
@html_parser = %w[httparty nokogiri].include?(html_parser) ? html_parser : '
|
153
|
-
|
157
|
+
@html_parser = %w[httparty nokogiri].include?(html_parser) ? html_parser : 'net_http'
|
158
|
+
rejudge_nokogiri if @old_html_parser != @html_parser
|
154
159
|
puts "HtmlParser: #{@html_parser}"
|
155
160
|
root_path = options.fetch(:root_path) { Dir.pwd }
|
156
161
|
@root_path = (File.directory?(root_path) ? root_path : Dir.pwd).chomp('/')
|
@@ -161,12 +166,12 @@ module Hertools
|
|
161
166
|
false
|
162
167
|
end
|
163
168
|
|
164
|
-
def
|
165
|
-
@
|
169
|
+
def nokogiri?
|
170
|
+
@judge_nokogiri ||= @html_parser == 'nokogiri'
|
166
171
|
end
|
167
172
|
|
168
|
-
def
|
169
|
-
@
|
173
|
+
def rejudge_nokogiri
|
174
|
+
@judge_nokogiri = nil
|
170
175
|
end
|
171
176
|
end
|
172
177
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: hertools
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- WuDi
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-09-
|
11
|
+
date: 2020-09-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: httparty
|