web_stat 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +15 -0
- data/.rspec +3 -0
- data/.travis.yml +7 -0
- data/CODE_OF_CONDUCT.md +74 -0
- data/Gemfile +4 -0
- data/Gemfile.lock +84 -0
- data/LICENSE.txt +21 -0
- data/README.md +43 -0
- data/Rakefile +8 -0
- data/lib/web_stat/categorize.rb +5 -0
- data/lib/web_stat/config/web_stat.yml +13 -0
- data/lib/web_stat/configure.rb +30 -0
- data/lib/web_stat/errors.rb +3 -0
- data/lib/web_stat/fetch/fetch_as_html.rb +11 -0
- data/lib/web_stat/fetch/fetch_as_web.rb +23 -0
- data/lib/web_stat/fetch.rb +92 -0
- data/lib/web_stat/tag.rb +22 -0
- data/lib/web_stat/tasks/install.rake +13 -0
- data/lib/web_stat/version.rb +3 -0
- data/lib/web_stat.rb +20 -0
- data/spec/fixtures/htmls/blog.html +190 -0
- data/spec/fixtures/htmls/h1-title.html +11 -0
- data/spec/fixtures/htmls/image.html +191 -0
- data/spec/spec_helper.rb +53 -0
- data/spec/web_stat/configure_spec.rb +13 -0
- data/spec/web_stat/fetch_spec.rb +69 -0
- data/spec/web_stat/tag_spec.rb +14 -0
- data/spec/web_stat_spec.rb +2 -0
- data/web_stat.gemspec +33 -0
- metadata +207 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: '09e8b3e9d7829de7767ecfa8136109ec26fd8b8d1534a65a756189b51b7a1c1d'
|
4
|
+
data.tar.gz: 004a67e393d63f543cfc5751ec410f308fd074a1e3867e04dfce9cee0002b2af
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: f24ef29fddc5ac0da80683c1574312797c8cd60004fe9c1b954add1b222cd802577d3b9abd6314b5d29def3dbd5b1db27f568032a9af5e61964b076d2020dc14
|
7
|
+
data.tar.gz: b29b88a3984adeedb11c8eeca7aed18d9ef8c7417ef431881caaeffa47a2896ba64b53ad00b9bed5af4bc3cca48e6beeb8a10a6202dbf4fc298287c8427a289c
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/.travis.yml
ADDED
data/CODE_OF_CONDUCT.md
ADDED
@@ -0,0 +1,74 @@
|
|
1
|
+
# Contributor Covenant Code of Conduct
|
2
|
+
|
3
|
+
## Our Pledge
|
4
|
+
|
5
|
+
In the interest of fostering an open and welcoming environment, we as
|
6
|
+
contributors and maintainers pledge to making participation in our project and
|
7
|
+
our community a harassment-free experience for everyone, regardless of age, body
|
8
|
+
size, disability, ethnicity, gender identity and expression, level of experience,
|
9
|
+
nationality, personal appearance, race, religion, or sexual identity and
|
10
|
+
orientation.
|
11
|
+
|
12
|
+
## Our Standards
|
13
|
+
|
14
|
+
Examples of behavior that contributes to creating a positive environment
|
15
|
+
include:
|
16
|
+
|
17
|
+
* Using welcoming and inclusive language
|
18
|
+
* Being respectful of differing viewpoints and experiences
|
19
|
+
* Gracefully accepting constructive criticism
|
20
|
+
* Focusing on what is best for the community
|
21
|
+
* Showing empathy towards other community members
|
22
|
+
|
23
|
+
Examples of unacceptable behavior by participants include:
|
24
|
+
|
25
|
+
* The use of sexualized language or imagery and unwelcome sexual attention or
|
26
|
+
advances
|
27
|
+
* Trolling, insulting/derogatory comments, and personal or political attacks
|
28
|
+
* Public or private harassment
|
29
|
+
* Publishing others' private information, such as a physical or electronic
|
30
|
+
address, without explicit permission
|
31
|
+
* Other conduct which could reasonably be considered inappropriate in a
|
32
|
+
professional setting
|
33
|
+
|
34
|
+
## Our Responsibilities
|
35
|
+
|
36
|
+
Project maintainers are responsible for clarifying the standards of acceptable
|
37
|
+
behavior and are expected to take appropriate and fair corrective action in
|
38
|
+
response to any instances of unacceptable behavior.
|
39
|
+
|
40
|
+
Project maintainers have the right and responsibility to remove, edit, or
|
41
|
+
reject comments, commits, code, wiki edits, issues, and other contributions
|
42
|
+
that are not aligned to this Code of Conduct, or to ban temporarily or
|
43
|
+
permanently any contributor for other behaviors that they deem inappropriate,
|
44
|
+
threatening, offensive, or harmful.
|
45
|
+
|
46
|
+
## Scope
|
47
|
+
|
48
|
+
This Code of Conduct applies both within project spaces and in public spaces
|
49
|
+
when an individual is representing the project or its community. Examples of
|
50
|
+
representing a project or community include using an official project e-mail
|
51
|
+
address, posting via an official social media account, or acting as an appointed
|
52
|
+
representative at an online or offline event. Representation of a project may be
|
53
|
+
further defined and clarified by project maintainers.
|
54
|
+
|
55
|
+
## Enforcement
|
56
|
+
|
57
|
+
Instances of abusive, harassing, or otherwise unacceptable behavior may be
|
58
|
+
reported by contacting the project team at yube@newsdict.jp. All
|
59
|
+
complaints will be reviewed and investigated and will result in a response that
|
60
|
+
is deemed necessary and appropriate to the circumstances. The project team is
|
61
|
+
obligated to maintain confidentiality with regard to the reporter of an incident.
|
62
|
+
Further details of specific enforcement policies may be posted separately.
|
63
|
+
|
64
|
+
Project maintainers who do not follow or enforce the Code of Conduct in good
|
65
|
+
faith may face temporary or permanent repercussions as determined by other
|
66
|
+
members of the project's leadership.
|
67
|
+
|
68
|
+
## Attribution
|
69
|
+
|
70
|
+
This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
|
71
|
+
available at [http://contributor-covenant.org/version/1/4][version]
|
72
|
+
|
73
|
+
[homepage]: http://contributor-covenant.org
|
74
|
+
[version]: http://contributor-covenant.org/version/1/4/
|
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,84 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
web_stat (0.1.0)
|
5
|
+
bundler (~> 2.0)
|
6
|
+
final_redirect_url (~> 0.1.0)
|
7
|
+
mechanize (~> 2.7)
|
8
|
+
natto (~> 1.1.2)
|
9
|
+
nokogiri (~> 1.10)
|
10
|
+
ruby-readability (~> 0.7)
|
11
|
+
sanitize (~> 5.0.0)
|
12
|
+
|
13
|
+
GEM
|
14
|
+
remote: https://rubygems.org/
|
15
|
+
specs:
|
16
|
+
connection_pool (2.2.2)
|
17
|
+
crass (1.0.4)
|
18
|
+
diff-lcs (1.3)
|
19
|
+
domain_name (0.5.20180417)
|
20
|
+
unf (>= 0.0.5, < 1.0.0)
|
21
|
+
ffi (1.10.0)
|
22
|
+
final_redirect_url (0.1.0)
|
23
|
+
guess_html_encoding (0.0.11)
|
24
|
+
http-cookie (1.0.3)
|
25
|
+
domain_name (~> 0.5)
|
26
|
+
mechanize (2.7.6)
|
27
|
+
domain_name (~> 0.5, >= 0.5.1)
|
28
|
+
http-cookie (~> 1.0)
|
29
|
+
mime-types (>= 1.17.2)
|
30
|
+
net-http-digest_auth (~> 1.1, >= 1.1.1)
|
31
|
+
net-http-persistent (>= 2.5.2)
|
32
|
+
nokogiri (~> 1.6)
|
33
|
+
ntlm-http (~> 0.1, >= 0.1.1)
|
34
|
+
webrobots (>= 0.0.9, < 0.2)
|
35
|
+
mime-types (3.2.2)
|
36
|
+
mime-types-data (~> 3.2015)
|
37
|
+
mime-types-data (3.2018.0812)
|
38
|
+
mini_portile2 (2.4.0)
|
39
|
+
natto (1.1.2)
|
40
|
+
ffi (>= 1.9.0)
|
41
|
+
net-http-digest_auth (1.4.1)
|
42
|
+
net-http-persistent (3.0.0)
|
43
|
+
connection_pool (~> 2.2)
|
44
|
+
nokogiri (1.10.1)
|
45
|
+
mini_portile2 (~> 2.4.0)
|
46
|
+
nokogumbo (2.0.1)
|
47
|
+
nokogiri (~> 1.8, >= 1.8.4)
|
48
|
+
ntlm-http (0.1.1)
|
49
|
+
rake (10.5.0)
|
50
|
+
rspec (3.8.0)
|
51
|
+
rspec-core (~> 3.8.0)
|
52
|
+
rspec-expectations (~> 3.8.0)
|
53
|
+
rspec-mocks (~> 3.8.0)
|
54
|
+
rspec-core (3.8.0)
|
55
|
+
rspec-support (~> 3.8.0)
|
56
|
+
rspec-expectations (3.8.2)
|
57
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
58
|
+
rspec-support (~> 3.8.0)
|
59
|
+
rspec-mocks (3.8.0)
|
60
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
61
|
+
rspec-support (~> 3.8.0)
|
62
|
+
rspec-support (3.8.0)
|
63
|
+
ruby-readability (0.7.0)
|
64
|
+
guess_html_encoding (>= 0.0.4)
|
65
|
+
nokogiri (>= 1.6.0)
|
66
|
+
sanitize (5.0.0)
|
67
|
+
crass (~> 1.0.2)
|
68
|
+
nokogiri (>= 1.8.0)
|
69
|
+
nokogumbo (~> 2.0)
|
70
|
+
unf (0.1.4)
|
71
|
+
unf_ext
|
72
|
+
unf_ext (0.0.7.5)
|
73
|
+
webrobots (0.1.2)
|
74
|
+
|
75
|
+
PLATFORMS
|
76
|
+
ruby
|
77
|
+
|
78
|
+
DEPENDENCIES
|
79
|
+
rake (~> 10.0)
|
80
|
+
rspec (~> 3.0)
|
81
|
+
web_stat!
|
82
|
+
|
83
|
+
BUNDLED WITH
|
84
|
+
2.0.1
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2019 yusuke abe
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
# WebStat
|
2
|
+
|
3
|
+
Fetch a web page and extract its stats: title, site name, main content, eyecatch image, and tags.
|
4
|
+
|
5
|
+
## Requirements
|
6
|
+
|
7
|
+
- [MeCab _0.996_](http://taku910.github.io/mecab/#download)
|
8
|
+
- Runtime dependencies
|
9
|
+
- "bundler", "~> 2.0"
|
10
|
+
- "nokogiri", "~> 1.10"
|
11
|
+
- "mechanize", "~> 2.7"
|
12
|
+
- "ruby-readability", "~> 0.7"
|
13
|
+
- "final_redirect_url", "~> 0.1.0"
|
14
|
+
- "natto", "~> 1.1.2"
|
15
|
+
- Development dependencies
|
16
|
+
- "rake", "~> 10.0"
|
17
|
+
- "rspec", "~> 3.0"
|
18
|
+
spec.add_development_dependency "rake", "~> 10.0"
|
19
|
+
spec.add_development_dependency "rspec", "~> 3.0"
|
20
|
+
|
21
|
+
## Installation
|
22
|
+
|
23
|
+
Add this line to your application's Gemfile:
|
24
|
+
|
25
|
+
```ruby
|
26
|
+
gem 'web_stat'
|
27
|
+
```
|
28
|
+
|
29
|
+
And then execute:
|
30
|
+
|
31
|
+
$ bundle
|
32
|
+
|
33
|
+
Or install it yourself as:
|
34
|
+
|
35
|
+
$ gem install web_stat
|
36
|
+
|
37
|
+
## Usage
|
38
|
+
|
39
|
+
You can customize the web_stat configuration.
|
40
|
+
|
41
|
+
And then execute:
|
42
|
+
|
43
|
+
$ rake web_stat:install
|
data/Rakefile
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
# Minimum number of characters to detect meta title
|
2
|
+
min_length_of_meta_title: 10
|
3
|
+
# Split regular expression for titles
|
4
|
+
regex_to_sprit_title: '\||-|:|||:|〜|\~| – '
|
5
|
+
# User Agent
|
6
|
+
user_agent: "web_stat gem agent"
|
7
|
+
# Eyecatch image xpaths
|
8
|
+
eyecatch_image_xpaths:
|
9
|
+
- '/html/head/meta[@property="twitter:image"]/@content'
|
10
|
+
- '/html/head/meta[@property="og:image"]/@content'
|
11
|
+
- '//img[@class="attachment-post-thumbnail"]/@src'
|
12
|
+
- '//div[@id="content"]//img/@src'
|
13
|
+
- '//img/@src'
|
@@ -0,0 +1,30 @@
|
|
1
|
+
require 'yaml'
|
2
|
+
module WebStat
  # Locates and loads the web_stat YAML configuration.
  #
  # A file at <Bundler.root>/config/web_stat.yml takes precedence when it
  # exists; otherwise the config/web_stat.yml located next to this source
  # file is used.
  class Configure
    # Config file location, relative to either the Bundler root or the
    # directory containing this source file.
    DEFAULT_CONFIG_FILE_PATH = 'config/web_stat.yml'

    # Load the active configuration file.
    # @return [Object] whatever YAML.load_file parses from the selected file
    def self.get
      YAML.load_file(get_configure_path)
    end

    # Choose which configuration file to load.
    # @return [String] the custom path when that file exists, otherwise the
    #   default path
    def self.get_configure_path
      custom_path = get_custom_configure_path
      # File.exist? is required here: the File.exists? alias was deprecated
      # and then removed in Ruby 3.2.
      File.exist?(custom_path) ? custom_path : get_default_configure_path
    end

    # Path of the configuration file located relative to this source file.
    # @return [String]
    def self.get_default_configure_path
      File.join(File.expand_path("../", __FILE__), DEFAULT_CONFIG_FILE_PATH)
    end

    # Path of the configuration file under the Bundler root.
    # NOTE(review): relies on Bundler being loaded by the caller; this file
    # itself does not require it.
    # @return [String]
    def self.get_custom_configure_path
      File.join(Bundler.root, DEFAULT_CONFIG_FILE_PATH)
    end
  end
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'mechanize'
|
2
|
+
module WebStat
  # Fetch variant that downloads the page from a URL before analysing it.
  class FetchAsWeb < Fetch
    attr_accessor :url

    # Download the page and parse it.
    # @param [String] url address of the page to fetch
    def initialize(url)
      @url = url
      page_body = get_url(url)
      # The body is relabelled as UTF-8 in place (no transcoding is done).
      @html = page_body.force_encoding("utf-8")
      @nokogiri = ::Nokogiri::HTML(@html)
    end

    # Resolve the URL that @url finally redirects to.
    # @return [String] the redirect target for http(s) URLs, otherwise @url
    #   itself
    def original_url
      return @url unless @url.match(/^http/)

      FinalRedirectUrl.final_redirect_url(@url)
    end
  end
end
|
@@ -0,0 +1,92 @@
|
|
1
|
+
require 'uri'
|
2
|
+
require 'digest'
|
3
|
+
require 'sanitize'
|
4
|
+
require 'nokogiri'
|
5
|
+
require 'ruby-readability'
|
6
|
+
require 'final_redirect_url'
|
7
|
+
module WebStat
  # Base class that extracts page information (title, site name, main
  # content, eyecatch image and tags) from an HTML document.
  #
  # The methods read @html (raw HTML), @nokogiri (parsed document) and,
  # optionally, @url (address of the page); none of them is assigned in this
  # class, so they are presumably set by subclasses.
  class Fetch
    attr_accessor :html, :nokogiri, :original_url

    # Get title
    #
    # Takes the part of <title> before the first configured separator. When
    # that part is shorter than the configured minimum, the first <h1> is
    # used instead. If anything in that lookup fails, the whole <title> is
    # returned.
    # @return [String] stripped title; "" when the document has no <title>
    def title
      whole_title = @nokogiri.title.to_s
      begin
        candidate = whole_title.split(title_separator, 2).first
        if candidate.length < WebStat::Configure.get["min_length_of_meta_title"]
          candidate = @nokogiri.css("h1").first.content
        end
      rescue StandardError
        # Best effort: fall back to the unsplit <title>.
        candidate = whole_title
      end
      candidate.to_s.strip
    end

    # Get name of the site: the part of <title> after the first configured
    # separator (the whole title when there is no separator).
    # @return [String] stripped site name; "" when the document has no <title>
    def site_name
      whole_title = @nokogiri.title.to_s
      begin
        name = whole_title.split(title_separator, 2).last
      rescue StandardError
        name = whole_title
      end
      name.to_s.strip
    end

    # Get main section
    # @return [String] readability-extracted content, sanitized
    def content
      Sanitize.clean(Readability::Document.new(@html).content)
    end

    # Get the URL of the eyecatch image.
    #
    # The configured XPath expressions are tried in order and the first
    # attribute found wins. When @url is known, a relative reference is
    # resolved against it.
    # @return [String, nil] image URL/path, or nil when nothing matched
    def eyecatch_image_path
      path = nil
      WebStat::Configure.get["eyecatch_image_xpaths"].each do |xpath|
        node = @nokogiri.xpath(xpath).first
        next unless node.respond_to?(:value)

        path = node.value
        break
      end
      return path unless @url && path.is_a?(String) && !path.start_with?("http")

      begin
        # URI.join resolves relative, root-relative and protocol-relative
        # references, and keeps the port of the page URL.
        URI.join(@url, path).to_s
      rescue URI::Error
        # Unparseable URL or path: leave the reference unresolved.
        path
      end
    end

    # Download url and store the response body in a temporary file.
    # @param [String] url
    # @return [String, nil] path of the written file, or nil when url is not
    #   an http(s) URL
    def save_local_path(url)
      return nil if url.nil? || !url.match(/^http/)

      tmp_file = "/tmp/#{Digest::SHA1.hexdigest(url)}"
      # Binary write: the body is written byte for byte, with no newline
      # appended and no text-mode conversion.
      File.binwrite(tmp_file, get_url(url))
      tmp_file
    end

    # Get url
    # @param [String] url
    # @return [String] response body
    def get_url(url)
      agent = Mechanize.new { |_agent| _agent.user_agent = WebStat::Configure.get["user_agent"] }
      # Enable to read Robots.txt
      agent.robots = true
      agent.get(url, [], nil, { 'Accept-Language' => 'ja'}).body
    end

    # Get the information of the page.
    # @return [Hash] :title, :site_name, :content, :url,
    #   :eyecatch_image_path and :tags
    def stat
      # Extract the main content once and reuse it for both the tags and the
      # :content entry.
      body = content
      tag = WebStat::Tag.new(body)
      {
        title: title,
        site_name: site_name,
        content: body,
        url: original_url,
        eyecatch_image_path: save_local_path(eyecatch_image_path),
        tags: tag.nouns
      }
    end

    private

    # Regexp built from the configured title separators.
    def title_separator
      /#{WebStat::Configure.get["regex_to_sprit_title"]}/
    end
  end
end
|
data/lib/web_stat/tag.rb
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
require 'natto'
|
2
|
+
module WebStat
  # Extracts tag candidates from an article with the MeCab tokenizer.
  class Tag
    attr_accessor :natto_mecab, :article

    # @param [String] article text to analyse
    def initialize(article)
      @natto_mecab = Natto::MeCab.new
      @article = article
    end

    # Collect the distinct surfaces of every token whose feature string
    # contains 固有名詞, in order of first appearance.
    # @return [Array<String>] unique surfaces; [] for a nil or empty article
    def nouns
      # Nothing to tokenize; do not hand nil/"" to the tokenizer.
      return [] if @article.to_s.empty?

      seen = {}
      @natto_mecab.parse(@article) do |n|
        next unless n.feature =~ /固有名詞/ && !n.surface.empty?

        # Only membership matters: the result is the list of keys.
        seen[n.surface] = true
      end
      seen.keys
    end
  end
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
require 'rbconfig'
|
2
|
+
namespace :web_stat do
  desc "install web_stat"
  task :install do
    # Currently this task only dumps the Ruby build configuration.
    p(RbConfig::CONFIG)

    # Disabled step: install custom configure
    #dirname = File.dirname(WebStat::Configure.get_custom_configure_path)
    #Dir.mkdir(dirname) unless File.directory?(dirname)
    #FileUtils.cp(WebStat::Configure.get_default_configure_path, dirname)

    # Not implemented yet: install mecab
  end
end
|
data/lib/web_stat.rb
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
require "bundler"
|
2
|
+
# Load every source file under the web_stat directory that sits next to this
# file. The glob is anchored to __dir__ rather than to a 'lib' directory
# relative to the current working directory, so the gem loads its own files
# no matter where the host process was started. The list is sorted so that
# web_stat/fetch.rb is required before the files in web_stat/fetch/.
Dir.glob(File.join(__dir__, "web_stat", "**", "*.rb")).sort.each do |file|
  require(file)
end

# Entry points of the web_stat gem.
module WebStat
  class << self
    # Get web page's stat by url
    # @param [String] url address of the page to download and analyse
    # @return [Object] the result of WebStat::FetchAsWeb#stat
    def stat_by_url(url)
      WebStat::FetchAsWeb.new(url).stat
    end

    # Get web page's stat by html
    # @param [String] html raw HTML to analyse
    # @return [Object] the result of WebStat::FetchAsHtml#stat
    def stat_by_html(html)
      WebStat::FetchAsHtml.new(html).stat
    end
  end
end
|