web_stat 0.2.6 → 0.2.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1f508dae86c4163037aa8ca2dd299be2f98ad01a2b4e61cff8c20b7562cb91e4
4
- data.tar.gz: 2a4e70f10396243d5917decf75714302ded5959cebb517fca6d27bc3c45db5eb
3
+ metadata.gz: bcfaeb202076ea30cae6205877d0b0ad9060eb84116b84ef7bd3580cc4349aac
4
+ data.tar.gz: 98e586440c8f3aed29e38a003f6d1afc96b26b2ae4c21e43d47e1dfaf6aaa16a
5
5
  SHA512:
6
- metadata.gz: c6e4b4066d42003bb4e07ac9bfd4184e73dfea3a3d4e3d7a5b239c227f009a631e01cfb7b5d17c0240453d5f943b9e1f49c846d6d2167dd9ad2122372e709fe3
7
- data.tar.gz: 597cef76252fdcc77beefcc5d24e68625c93fa5293bed0d7a40f26847b0db6a0b981c83a8e6f3beafee93778c1ff2bf16899a29c6df1866bf6b929b5d29c2975
6
+ metadata.gz: dfbe6264256f08550ebb42244d92bd81c976dd5dd72a6a4dabffdf6a1366a8f010e8a20527d0bbe7a8334cdb028e86ecae0c63d6cc4368741abe815f1fcb3092
7
+ data.tar.gz: 9098d904f26dfdfe14c87352cb47ca3f0333f5424ef26d2cd8232c088c2cf8d7dcbeb07012a9294a5c4e4949b51fe71e94d0995465400d87cf7c037fd09ba978
data/Gemfile CHANGED
@@ -1,6 +1,6 @@
1
1
  source "https://rubygems.org"
2
2
 
3
- gem "final_redirect_url", :git => "git@github.com:yubele/final_redirect_url.git"
3
+ gem "final_redirect_url", :git => "git@github.com:yubele/final_redirect_url"
4
4
 
5
5
  # Specify your gem's dependencies in web_stat.gemspec
6
6
  gemspec
@@ -1,13 +1,13 @@
1
1
  GIT
2
- remote: git@github.com:yubele/final_redirect_url.git
3
- revision: a8e8fb256f044606313d9726278516a0f24449d4
2
+ remote: git@github.com:yubele/final_redirect_url
3
+ revision: 45df878ec9495ebbfa06dc0a60cc5043c2519e16
4
4
  specs:
5
- final_redirect_url (0.1.0)
5
+ final_redirect_url (0.1.1)
6
6
 
7
7
  PATH
8
8
  remote: .
9
9
  specs:
10
- web_stat (0.2.6)
10
+ web_stat (0.2.11)
11
11
  bundler (>= 2.0.2)
12
12
  cld (>= 0.8.0)
13
13
  mechanize (>= 2.7)
@@ -27,13 +27,13 @@ GEM
27
27
  connection_pool (2.2.2)
28
28
  crack (0.4.3)
29
29
  safe_yaml (~> 1.0.0)
30
- crass (1.0.5)
30
+ crass (1.0.6)
31
31
  diff-lcs (1.3)
32
32
  domain_name (0.5.20190701)
33
33
  unf (>= 0.0.5, < 1.0.0)
34
- ffi (1.11.3)
34
+ ffi (1.12.2)
35
35
  guess_html_encoding (0.0.11)
36
- hashdiff (1.0.0)
36
+ hashdiff (1.0.1)
37
37
  http-cookie (1.0.3)
38
38
  domain_name (~> 0.5)
39
39
  mechanize (2.7.6)
@@ -45,39 +45,39 @@ GEM
45
45
  nokogiri (~> 1.6)
46
46
  ntlm-http (~> 0.1, >= 0.1.1)
47
47
  webrobots (>= 0.0.9, < 0.2)
48
- method_source (0.9.2)
48
+ method_source (1.0.0)
49
49
  mime-types (3.3.1)
50
50
  mime-types-data (~> 3.2015)
51
- mime-types-data (3.2019.1009)
51
+ mime-types-data (3.2020.0425)
52
52
  mini_portile2 (2.4.0)
53
- natto (1.1.2)
53
+ natto (1.2.0)
54
54
  ffi (>= 1.9.0)
55
55
  net-http-digest_auth (1.4.1)
56
- net-http-persistent (3.1.0)
56
+ net-http-persistent (4.0.0)
57
57
  connection_pool (~> 2.2)
58
- nokogiri (1.10.7)
58
+ nokogiri (1.10.9)
59
59
  mini_portile2 (~> 2.4.0)
60
60
  nokogumbo (2.0.2)
61
61
  nokogiri (~> 1.8, >= 1.8.4)
62
62
  ntlm-http (0.1.1)
63
- pry (0.12.2)
64
- coderay (~> 1.1.0)
65
- method_source (~> 0.9.0)
66
- public_suffix (4.0.2)
63
+ pry (0.13.1)
64
+ coderay (~> 1.1)
65
+ method_source (~> 1.0)
66
+ public_suffix (4.0.4)
67
67
  rake (13.0.1)
68
68
  rspec (3.9.0)
69
69
  rspec-core (~> 3.9.0)
70
70
  rspec-expectations (~> 3.9.0)
71
71
  rspec-mocks (~> 3.9.0)
72
- rspec-core (3.9.1)
73
- rspec-support (~> 3.9.1)
74
- rspec-expectations (3.9.0)
72
+ rspec-core (3.9.2)
73
+ rspec-support (~> 3.9.3)
74
+ rspec-expectations (3.9.1)
75
75
  diff-lcs (>= 1.2.0, < 2.0)
76
76
  rspec-support (~> 3.9.0)
77
77
  rspec-mocks (3.9.1)
78
78
  diff-lcs (>= 1.2.0, < 2.0)
79
79
  rspec-support (~> 3.9.0)
80
- rspec-support (3.9.2)
80
+ rspec-support (3.9.3)
81
81
  ruby-readability (0.7.0)
82
82
  guess_html_encoding (>= 0.0.4)
83
83
  nokogiri (>= 1.6.0)
@@ -88,8 +88,8 @@ GEM
88
88
  nokogumbo (~> 2.0)
89
89
  unf (0.1.4)
90
90
  unf_ext
91
- unf_ext (0.0.7.6)
92
- webmock (3.7.6)
91
+ unf_ext (0.0.7.7)
92
+ webmock (3.8.3)
93
93
  addressable (>= 2.3.6)
94
94
  crack (>= 0.3.2)
95
95
  hashdiff (>= 0.4.0, < 2.0.0)
@@ -107,4 +107,4 @@ DEPENDENCIES
107
107
  webmock (>= 3.6.0)
108
108
 
109
109
  BUNDLED WITH
110
- 2.1.2
110
+ 2.1.4
data/README.md CHANGED
@@ -1,3 +1,10 @@
1
+ # !!!!! Precautions when using with Rails !!!!!
2
+
3
+ Add this line to your Gemfile.
4
+ ```
5
+ gem "final_redirect_url", :git => "git@github.com:yubele/final_redirect_url"
6
+ ```
7
+
1
8
  # WebStat
2
9
 
3
10
  Fetch the web pages and stat.
@@ -18,6 +25,10 @@ Fetch the web pages and stat.
18
25
  - "rake", "~> 10.0"
19
26
  - "rspec", "~> 3.0"
20
27
 
28
+ ### Install mecab
29
+
30
+ $ sudo apt install mecab-ipadic-utf8 libmecab
31
+
21
32
  ## Installation
22
33
 
23
34
  Add this line to your application's Gemfile:
@@ -21,24 +21,26 @@ require "web_stat/fetch/fetch_as_web"
21
21
  module WebStat
22
22
  class << self
23
23
  # Get web page's stat by url
24
- def stat_by_web(url)
24
+ # @param [Hash] userdics Dictionary file path for each language code (String keys), e.g. {"ja" => "/path/to/ja.dic", "other" => "/path/to/other.dic"}
25
+ def stat_by_web(url, userdics: nil)
25
26
  web_stat = WebStat::FetchAsWeb.new(url)
26
- web_stat.stat
27
+ web_stat.stat(userdics: userdics)
27
28
  end
28
29
 
29
30
  # Get web page's stat by url
30
31
  # @param String url
31
- def stat_by_url(url)
32
- stat_by_web(url)
32
+ def stat_by_url(url, userdics: nil)
33
+ stat_by_web(url, userdics: userdics)
33
34
  end
34
35
 
35
36
  # Get web page's stat by html
36
37
  # @param String html
37
38
  # @param [String] url
38
- def stat_by_html(html, url=nil)
39
+ # @param [Hash] userdics Dictionary file path for each language code (String keys), e.g. {"ja" => "/path/to/ja.dic", "other" => "/path/to/other.dic"}
40
+ def stat_by_html(html, url=nil, userdics: nil)
39
41
  web_stat = WebStat::FetchAsHtml.new(html)
40
42
  web_stat.url = url unless url.nil?
41
- web_stat.stat
43
+ web_stat.stat(userdics: userdics)
42
44
  end
43
45
  end
44
46
  end
@@ -1,6 +1,6 @@
1
1
  module WebStat
2
2
  class Fetch
3
- attr_accessor :url, :html, :nokogiri, :userdic
3
+ attr_accessor :url, :html, :nokogiri, :userdic, :status
4
4
 
5
5
  # Get title
6
6
  # @return [String] title
@@ -13,7 +13,11 @@ module WebStat
13
13
  rescue
14
14
  title = @nokogiri.title
15
15
  end
16
- title.strip
16
+ if title.nil?
17
+ "No Title"
18
+ else
19
+ title.strip
20
+ end
17
21
  end
18
22
 
19
23
  # Get name of domain
@@ -23,7 +27,11 @@ module WebStat
23
27
  rescue
24
28
  site_name = @nokogiri.title
25
29
  end
26
- site_name.strip
30
+ if site_name.nil?
31
+ "No Sitename"
32
+ else
33
+ site_name.strip
34
+ end
27
35
  end
28
36
  []
29
37
  # Get main section
@@ -41,7 +49,7 @@ module WebStat
41
49
  break
42
50
  end
43
51
  end
44
- if path.match(/^\//)
52
+ if ! path.nil? && path.match(/^\//)
45
53
  "#{URI.parse(@url).scheme}://#{URI.parse(@url).host}#{path}"
46
54
  else
47
55
  path
@@ -53,9 +61,13 @@ module WebStat
53
61
  def save_local_path(url)
54
62
  return nil if url.nil?
55
63
  tmp_file = "/tmp/#{Digest::SHA1.hexdigest(url)}"
56
- URI.open(original_url(url)) do |remote_file|
57
- File.open(tmp_file, "w+b") do |_file|
58
- _file.puts(remote_file.read)
64
+ agent = Mechanize.new { |_agent| _agent.user_agent = WebStat::Configure.get["user_agent"] }
65
+ image = agent.get(url)
66
+ File.open(tmp_file, "w+b") do |_file|
67
+ if image.class == Mechanize::File
68
+ _file.puts(image.body)
69
+ else
70
+ _file.puts(image.body_io.read)
59
71
  end
60
72
  end
61
73
  tmp_file
@@ -63,23 +75,44 @@ module WebStat
63
75
 
64
76
  # Get url
65
77
  # @param [String] url
78
+ # @return [String] body
66
79
  def get_url(url)
67
80
  agent = Mechanize.new { |_agent| _agent.user_agent = WebStat::Configure.get["user_agent"] }
68
81
  # Enable to read Robots.txt
69
82
  agent.robots = true
70
- document = agent.get(url, [], nil, { 'Accept-Language' => 'ja'})
71
- document.body.encode('UTF-8', document.encoding)
83
+ begin
84
+ document = agent.get(url, [], nil, { 'Accept-Language' => 'ja'})
85
+ if document.class == Mechanize::File
86
+ body = document.body
87
+ else
88
+ body = document.body.encode('UTF-8', document.encoding)
89
+ end
90
+ @status = document.code
91
+ rescue Mechanize::ResponseCodeError => e
92
+ body = e.page.body
93
+ @status = e.page.code
94
+ end
95
+ body
72
96
  end
73
97
 
74
98
  # Get the information of @url
75
- def stat
76
- clean_content = content.scrub('').gsub(/[\s ]/, "")
77
- tag = WebStat::Tag.new(content, userdic: WebStat::Configure.get["userdic"])
99
+ # @param [Hash] userdics Dictionary file path for each language code (String keys), e.g. {"ja" => "/path/to/ja.dic", "other" => "/path/to/other.dic"}
100
+ def stat(userdics: nil)
101
+ clean_content = content.scrub('').gsub(/[\n\t\r ]/, "").gsub(/\s{2,}/, "\s")
102
+ language_code = CLD.detect_language(clean_content)[:code]
103
+ if userdics && userdics.has_key?(language_code) && File.exists?(userdics[language_code])
104
+ tag = WebStat::Tag.new("#{title} #{content}", userdic: userdics[language_code])
105
+ elsif userdics && userdics.has_key?("other") && File.exists?(userdics["other"])
106
+ tag = WebStat::Tag.new("#{title} #{content}", userdic: userdics["other"])
107
+ else
108
+ tag = WebStat::Tag.new("#{title} #{content}", userdic: WebStat::Configure.get["userdic"])
109
+ end
78
110
  {
79
111
  title: title,
80
112
  site_name: site_name,
81
113
  content: clean_content,
82
- language_code: CLD.detect_language(clean_content)[:code],
114
+ language_code: language_code,
115
+ status: @status,
83
116
  url: @url,
84
117
  eyecatch_image_path: save_local_path(eyecatch_image_path),
85
118
  tags: tag.nouns
@@ -91,8 +124,9 @@ module WebStat
91
124
  # Get original url
92
125
  # @param [String] url
93
126
  def original_url(url)
94
- if url.match(/^http/)
95
- FinalRedirectUrl.final_redirect_url(url)
127
+ last_url = FinalRedirectUrl.final_redirect_url(url)
128
+ unless last_url.nil? || last_url.scrub('').empty?
129
+ last_url
96
130
  else
97
131
  url
98
132
  end
@@ -1,3 +1,3 @@
1
1
  module WebStat
2
- VERSION = "0.2.6"
3
- end
2
+ VERSION = "0.2.11"
3
+ end
@@ -59,9 +59,10 @@ end
59
59
 
60
60
  # Set webmock
61
61
  WebStatTestHelper.scheme_and_files.each do |url|
62
+ status = [200, 404, 503].sample
62
63
  WebMock.stub_request(:get, url)
63
64
  .to_return(
64
- status: 200,
65
+ status: status,
65
66
  body: File.new(File.join(File.dirname(__FILE__), "fixtures", "htmls", File.basename(url))),
66
67
  headers: {content_type: 'application/html; charset=utf-8'})
67
68
  end
@@ -83,6 +83,7 @@ RSpec.describe WebStat::Fetch do
83
83
  expect(web_stat[:title]).to eq "gem作成でついまずいたところ"
84
84
  expect(web_stat[:site_name]).to eq "newsdict.blog"
85
85
  expect(web_stat[:content]).not_to eq nil
86
+ expect(web_stat[:status]).to eq("200").or eq("404").or eq("503")
86
87
  expect(Sanitize.clean(web_stat[:content]).length).to eq web_stat[:content].length
87
88
  expect(web_stat[:eyecatch_image_path]).to be_tmp_file_or_nil
88
89
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: web_stat
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.6
4
+ version: 0.2.11
5
5
  platform: ruby
6
6
  authors:
7
7
  - yusuke abe
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-01-05 00:00:00.000000000 Z
11
+ date: 2020-06-09 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -224,7 +224,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
224
224
  - !ruby/object:Gem::Version
225
225
  version: '0'
226
226
  requirements: []
227
- rubygems_version: 3.1.2
227
+ rubygems_version: 3.0.3
228
228
  signing_key:
229
229
  specification_version: 4
230
230
  summary: Get the status of the web pages.