web_stat 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: '09e8b3e9d7829de7767ecfa8136109ec26fd8b8d1534a65a756189b51b7a1c1d'
4
+ data.tar.gz: 004a67e393d63f543cfc5751ec410f308fd074a1e3867e04dfce9cee0002b2af
5
+ SHA512:
6
+ metadata.gz: f24ef29fddc5ac0da80683c1574312797c8cd60004fe9c1b954add1b222cd802577d3b9abd6314b5d29def3dbd5b1db27f568032a9af5e61964b076d2020dc14
7
+ data.tar.gz: b29b88a3984adeedb11c8eeca7aed18d9ef8c7417ef431881caaeffa47a2896ba64b53ad00b9bed5af4bc3cca48e6beeb8a10a6202dbf4fc298287c8427a289c
data/.gitignore ADDED
@@ -0,0 +1,15 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /_yardoc/
4
+ /coverage/
5
+ /doc/
6
+ /pkg/
7
+ /spec/reports/
8
+ /tmp/
9
+
10
+ # rspec failure tracking
11
+ .rspec_status
12
+
13
+ # built gems
14
+ *.gem
15
+ /config
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --format documentation
2
+ --color
3
+ --require spec_helper
data/.travis.yml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ sudo: false
3
+ language: ruby
4
+ cache: bundler
5
+ rvm:
6
+ - 2.5.5
7
+ before_install: gem install bundler -v 2.0.1
@@ -0,0 +1,74 @@
1
+ # Contributor Covenant Code of Conduct
2
+
3
+ ## Our Pledge
4
+
5
+ In the interest of fostering an open and welcoming environment, we as
6
+ contributors and maintainers pledge to making participation in our project and
7
+ our community a harassment-free experience for everyone, regardless of age, body
8
+ size, disability, ethnicity, gender identity and expression, level of experience,
9
+ nationality, personal appearance, race, religion, or sexual identity and
10
+ orientation.
11
+
12
+ ## Our Standards
13
+
14
+ Examples of behavior that contributes to creating a positive environment
15
+ include:
16
+
17
+ * Using welcoming and inclusive language
18
+ * Being respectful of differing viewpoints and experiences
19
+ * Gracefully accepting constructive criticism
20
+ * Focusing on what is best for the community
21
+ * Showing empathy towards other community members
22
+
23
+ Examples of unacceptable behavior by participants include:
24
+
25
+ * The use of sexualized language or imagery and unwelcome sexual attention or
26
+ advances
27
+ * Trolling, insulting/derogatory comments, and personal or political attacks
28
+ * Public or private harassment
29
+ * Publishing others' private information, such as a physical or electronic
30
+ address, without explicit permission
31
+ * Other conduct which could reasonably be considered inappropriate in a
32
+ professional setting
33
+
34
+ ## Our Responsibilities
35
+
36
+ Project maintainers are responsible for clarifying the standards of acceptable
37
+ behavior and are expected to take appropriate and fair corrective action in
38
+ response to any instances of unacceptable behavior.
39
+
40
+ Project maintainers have the right and responsibility to remove, edit, or
41
+ reject comments, commits, code, wiki edits, issues, and other contributions
42
+ that are not aligned to this Code of Conduct, or to ban temporarily or
43
+ permanently any contributor for other behaviors that they deem inappropriate,
44
+ threatening, offensive, or harmful.
45
+
46
+ ## Scope
47
+
48
+ This Code of Conduct applies both within project spaces and in public spaces
49
+ when an individual is representing the project or its community. Examples of
50
+ representing a project or community include using an official project e-mail
51
+ address, posting via an official social media account, or acting as an appointed
52
+ representative at an online or offline event. Representation of a project may be
53
+ further defined and clarified by project maintainers.
54
+
55
+ ## Enforcement
56
+
57
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be
58
+ reported by contacting the project team at yube@newsdict.jp. All
59
+ complaints will be reviewed and investigated and will result in a response that
60
+ is deemed necessary and appropriate to the circumstances. The project team is
61
+ obligated to maintain confidentiality with regard to the reporter of an incident.
62
+ Further details of specific enforcement policies may be posted separately.
63
+
64
+ Project maintainers who do not follow or enforce the Code of Conduct in good
65
+ faith may face temporary or permanent repercussions as determined by other
66
+ members of the project's leadership.
67
+
68
+ ## Attribution
69
+
70
+ This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
71
+ available at [http://contributor-covenant.org/version/1/4][version]
72
+
73
+ [homepage]: http://contributor-covenant.org
74
+ [version]: http://contributor-covenant.org/version/1/4/
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source "https://rubygems.org"
2
+
3
+ # Specify your gem's dependencies in web_stat.gemspec
4
+ gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,84 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ web_stat (0.1.0)
5
+ bundler (~> 2.0)
6
+ final_redirect_url (~> 0.1.0)
7
+ mechanize (~> 2.7)
8
+ natto (~> 1.1.2)
9
+ nokogiri (~> 1.10)
10
+ ruby-readability (~> 0.7)
11
+ sanitize (~> 5.0.0)
12
+
13
+ GEM
14
+ remote: https://rubygems.org/
15
+ specs:
16
+ connection_pool (2.2.2)
17
+ crass (1.0.4)
18
+ diff-lcs (1.3)
19
+ domain_name (0.5.20180417)
20
+ unf (>= 0.0.5, < 1.0.0)
21
+ ffi (1.10.0)
22
+ final_redirect_url (0.1.0)
23
+ guess_html_encoding (0.0.11)
24
+ http-cookie (1.0.3)
25
+ domain_name (~> 0.5)
26
+ mechanize (2.7.6)
27
+ domain_name (~> 0.5, >= 0.5.1)
28
+ http-cookie (~> 1.0)
29
+ mime-types (>= 1.17.2)
30
+ net-http-digest_auth (~> 1.1, >= 1.1.1)
31
+ net-http-persistent (>= 2.5.2)
32
+ nokogiri (~> 1.6)
33
+ ntlm-http (~> 0.1, >= 0.1.1)
34
+ webrobots (>= 0.0.9, < 0.2)
35
+ mime-types (3.2.2)
36
+ mime-types-data (~> 3.2015)
37
+ mime-types-data (3.2018.0812)
38
+ mini_portile2 (2.4.0)
39
+ natto (1.1.2)
40
+ ffi (>= 1.9.0)
41
+ net-http-digest_auth (1.4.1)
42
+ net-http-persistent (3.0.0)
43
+ connection_pool (~> 2.2)
44
+ nokogiri (1.10.1)
45
+ mini_portile2 (~> 2.4.0)
46
+ nokogumbo (2.0.1)
47
+ nokogiri (~> 1.8, >= 1.8.4)
48
+ ntlm-http (0.1.1)
49
+ rake (10.5.0)
50
+ rspec (3.8.0)
51
+ rspec-core (~> 3.8.0)
52
+ rspec-expectations (~> 3.8.0)
53
+ rspec-mocks (~> 3.8.0)
54
+ rspec-core (3.8.0)
55
+ rspec-support (~> 3.8.0)
56
+ rspec-expectations (3.8.2)
57
+ diff-lcs (>= 1.2.0, < 2.0)
58
+ rspec-support (~> 3.8.0)
59
+ rspec-mocks (3.8.0)
60
+ diff-lcs (>= 1.2.0, < 2.0)
61
+ rspec-support (~> 3.8.0)
62
+ rspec-support (3.8.0)
63
+ ruby-readability (0.7.0)
64
+ guess_html_encoding (>= 0.0.4)
65
+ nokogiri (>= 1.6.0)
66
+ sanitize (5.0.0)
67
+ crass (~> 1.0.2)
68
+ nokogiri (>= 1.8.0)
69
+ nokogumbo (~> 2.0)
70
+ unf (0.1.4)
71
+ unf_ext
72
+ unf_ext (0.0.7.5)
73
+ webrobots (0.1.2)
74
+
75
+ PLATFORMS
76
+ ruby
77
+
78
+ DEPENDENCIES
79
+ rake (~> 10.0)
80
+ rspec (~> 3.0)
81
+ web_stat!
82
+
83
+ BUNDLED WITH
84
+ 2.0.1
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2019 yusuke abe
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,43 @@
1
+ # WebStat
2
+
3
+ Fetch web pages and collect stats about them (title, site name, main content, eyecatch image, and tags).
4
+
5
+ ## Requirements
6
+
7
+ - [MeCab _0.996_](http://taku910.github.io/mecab/#download)
8
+ - add runtime dependency
9
+ - "bundler", "~> 2.0"
10
+ - "nokogiri", "~> 1.10"
11
+ - "mechanize", "~> 2.7"
12
+ - "ruby-readability", "~> 0.7"
13
+ - "final_redirect_url", "~> 0.1.0"
14
+ - "natto", "~> 1.1.2"
15
+ - add development dependency
16
+ - "rake", "~> 10.0"
17
+ - "rspec", "~> 3.0"
18
+ spec.add_development_dependency "rake", "~> 10.0"
19
+ spec.add_development_dependency "rspec", "~> 3.0"
20
+
21
+ ## Installation
22
+
23
+ Add this line to your application's Gemfile:
24
+
25
+ ```ruby
26
+ gem 'web_stat'
27
+ ```
28
+
29
+ And then execute:
30
+
31
+ $ bundle
32
+
33
+ Or install it yourself as:
34
+
35
+ $ gem install web_stat
36
+
37
+ ## Usage
38
+
39
+ You can customize the web_stat configuration.
40
+
41
+ And then execute:
42
+
43
+ $ rake web_stat:install
data/Rakefile ADDED
@@ -0,0 +1,8 @@
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
# Pull in the library itself and the rake task file that ships with it.
load 'lib/web_stat.rb'
load "lib/web_stat/tasks/install.rake"

# Define the `spec` task that runs the RSpec suite.
RSpec::Core::RakeTask.new(:spec)

# Running plain `rake` runs the specs.
task default: :spec
@@ -0,0 +1,5 @@
1
module WebStat
  # Placeholder for page categorization; no behavior is implemented yet.
  class Categorize; end
end
@@ -0,0 +1,13 @@
1
+ # Minimum number of characters to detect meta title
2
+ min_length_of_meta_title: 10
3
+ # Split regular expression for titles
4
+ regex_to_sprit_title: '\||-|:|||:|〜|\~| – '
5
+ # User Agent
6
+ user_agent: "web_stat gem agent"
7
+ # Eyecatch image xpaths
8
+ eyecatch_image_xpaths:
9
+ - '/html/head/meta[@property="twitter:image"]/@content'
10
+ - '/html/head/meta[@property="og:image"]/@content'
11
+ - '//img[@class="attachment-post-thumbnail"]/@src'
12
+ - '//div[@id="content"]//img/@src'
13
+ - '//img/@src'
@@ -0,0 +1,30 @@
1
+ require 'yaml'
2
module WebStat
  # Locates and loads the web_stat YAML configuration.
  #
  # A project-level file (config/web_stat.yml under Bundler.root) takes
  # precedence; when it is absent, the copy resolved relative to this source
  # file is used instead.
  class Configure
    # Location of the configuration file, relative to the base directory.
    DEFAULT_CONFIG_FILE_PATH = 'config/web_stat.yml'

    # Load the active configuration file.
    # @return [Object] the value YAML.load_file parses from the file
    def self.get
      YAML.load_file(get_configure_path)
    end

    # Pick the configuration file to read.
    # @return [String] the custom path when that file exists, else the default path
    def self.get_configure_path
      custom_path = get_custom_configure_path
      # File.exists? is deprecated and was removed in Ruby 3.2; File.exist?
      # is the supported spelling on every Ruby version.
      File.exist?(custom_path) ? custom_path : get_default_configure_path
    end

    # Path of the configuration file bundled alongside this source file.
    # @return [String]
    def self.get_default_configure_path
      File.join(File.expand_path("../", __FILE__), DEFAULT_CONFIG_FILE_PATH)
    end

    # Path of the configuration file inside the host project (Bundler.root).
    # @return [String]
    def self.get_custom_configure_path
      File.join(Bundler.root, DEFAULT_CONFIG_FILE_PATH)
    end
  end
end
@@ -0,0 +1,3 @@
1
# Placeholder: opens the WebStat namespace without adding anything yet.
module WebStat; end
@@ -0,0 +1,11 @@
1
module WebStat
  # Fetcher for markup that the caller already has in hand.
  class FetchAsHtml < Fetch
    # Store the markup and parse it with Nokogiri.
    # @param [String] html raw HTML of the page
    def initialize(html)
      @html = html
      @nokogiri = ::Nokogiri::HTML(html)
    end
  end
end
@@ -0,0 +1,23 @@
1
+ require 'mechanize'
2
module WebStat
  # Fetcher that downloads the page from a URL before parsing it.
  class FetchAsWeb < Fetch
    attr_accessor :url

    # Download the page, tag the body as UTF-8 and parse it with Nokogiri.
    # @param [String] url address of the page to fetch
    def initialize(url)
      @url = url
      body = get_url(url)
      @html = body.force_encoding("utf-8")
      @nokogiri = ::Nokogiri::HTML(@html)
    end

    # Resolve the final URL after redirects.
    # Values that do not start with "http" are returned untouched.
    # @return [String]
    def original_url
      return @url unless @url.match(/^http/)

      FinalRedirectUrl.final_redirect_url(@url)
    end
  end
end
@@ -0,0 +1,92 @@
1
+ require 'uri'
2
+ require 'digest'
3
+ require 'sanitize'
4
+ require 'nokogiri'
5
+ require 'ruby-readability'
6
+ require 'final_redirect_url'
7
module WebStat
  # Shared extraction logic for a fetched page.
  #
  # The methods read @html (raw markup), @nokogiri (parsed document) and,
  # when set, @url (address of the page). This class does not assign them
  # itself.
  class Fetch
    attr_accessor :html, :nokogiri, :original_url

    # Get title
    #
    # Takes the part of <title> before the first configured separator. When
    # that part is shorter than "min_length_of_meta_title", the first <h1> is
    # used instead. Any failure falls back to the whole <title>.
    # @return [String] title ("" when the document has no title at all)
    def title
      begin
        config = WebStat::Configure.get
        title = @nokogiri.title.split(/#{config["regex_to_sprit_title"]}/, 2).first
        if title.length < config["min_length_of_meta_title"]
          title = @nokogiri.css("h1").first.content
        end
      rescue StandardError
        title = @nokogiri.title
      end
      # to_s: the fallback is nil when the document has no <title>.
      title.to_s.strip
    end

    # Get the site name: the part of <title> after the first configured
    # separator (the whole title when there is no separator).
    # @return [String] site name ("" when the document has no title at all)
    def site_name
      begin
        site_name = @nokogiri.title.split(/#{WebStat::Configure.get["regex_to_sprit_title"]}/, 2).last
      rescue StandardError
        site_name = @nokogiri.title
      end
      site_name.to_s.strip
    end

    # Get main section: Readability's extracted content, cleaned by Sanitize.
    # @return [String]
    def content
      Sanitize.clean(Readability::Document.new(@html).content)
    end

    # Get the URL (or raw attribute value) of the eyecatch image.
    #
    # The configured XPaths are tried in order and the first one whose first
    # match exposes #value wins. When @url is known, a value that does not
    # start with "http" is resolved against it, so the port, relative
    # directories and protocol-relative references are handled.
    # @return [String, nil] nil when no XPath matched
    def eyecatch_image_path
      path = nil
      WebStat::Configure.get["eyecatch_image_xpaths"].each do |xpath|
        node = @nokogiri.xpath(xpath).first
        next unless node.respond_to?(:value)

        path = node.value
        break
      end
      return path unless @url && path.is_a?(String) && !path.match?(/^http/)

      begin
        URI.join(@url, path).to_s
      rescue URI::Error
        # Unparseable reference: hand back the raw attribute value.
        path
      end
    end

    # Download url into a file under /tmp named after the SHA1 of the URL.
    # @param [String] url
    # @return [String, nil] path of the saved file, or nil when url is nil
    #   or does not start with "http"
    def save_local_path(url)
      return nil if url.nil? || !url.match?(/^http/)

      tmp_file = "/tmp/#{Digest::SHA1.hexdigest(url)}"
      # Binary write: the body is typically image data, which must not be
      # newline-terminated or encoding-converted.
      File.binwrite(tmp_file, get_url(url))
      tmp_file
    end

    # Get url
    # @param [String] url
    # @return [String] response body
    def get_url(url)
      agent = Mechanize.new { |_agent| _agent.user_agent = WebStat::Configure.get["user_agent"] }
      # Enable to read Robots.txt
      agent.robots = true
      agent.get(url, [], nil, { 'Accept-Language' => 'ja' }).body
    end

    # Get the information about the page.
    # @return [Hash] :title, :site_name, :content, :url,
    #   :eyecatch_image_path (local file path or nil) and :tags
    def stat
      # Extract the content once; it feeds both the result and the tagger.
      main_content = content
      {
        title: title,
        site_name: site_name,
        content: main_content,
        url: original_url,
        eyecatch_image_path: save_local_path(eyecatch_image_path),
        tags: WebStat::Tag.new(main_content).nouns
      }
    end
  end
end
@@ -0,0 +1,22 @@
1
+ require 'natto'
2
module WebStat
  # Extracts tag candidates from an article using a MeCab parser.
  class Tag
    attr_accessor :natto_mecab, :article

    # @param [String] article text to extract nouns from
    def initialize(article)
      @natto_mecab = Natto::MeCab.new
      @article = article
    end

    # Collect the distinct surfaces of nodes whose feature string contains
    # "固有名詞", in order of first appearance.
    # @return [Array<String>] unique non-empty surfaces
    def nouns
      # Nothing to parse: skip the parser entirely.
      return [] if @article.nil? || @article.to_s.empty?

      surfaces = []
      @natto_mecab.parse(@article) do |node|
        next unless node.feature =~ /固有名詞/ && !node.surface.empty?

        surfaces << node.surface
      end
      # uniq keeps the first occurrence of each surface, which matches the
      # key order of the counting hash this replaces.
      surfaces.uniq
    end
  end
end
@@ -0,0 +1,13 @@
1
+ require 'rbconfig'
2
namespace :web_stat do
  desc "install web_stat"
  task :install do
    # For now the task only prints the interpreter's build configuration.
    p(RbConfig::CONFIG)

    # Installing the custom configuration is disabled:
    #   dirname = File.dirname(WebStat::Configure.get_custom_configure_path)
    #   Dir.mkdir(dirname) unless File.directory?(dirname)
    #   FileUtils.cp(WebStat::Configure.get_default_configure_path, dirname)

    # Installing MeCab is not implemented.
  end
end
@@ -0,0 +1,3 @@
1
module WebStat
  # Gem version string; frozen so it cannot be mutated at runtime.
  VERSION = "0.1.1".freeze
end
data/lib/web_stat.rb ADDED
@@ -0,0 +1,20 @@
1
require "bundler"

# Load every source file under lib/web_stat.
# - The glob is anchored on this file's directory rather than the working
#   directory, so loading works no matter where the process was started.
# - Only web_stat/** is scanned, so this file never requires itself.
# - The list is sorted for a deterministic order (fetch.rb sorts before
#   fetch_as_*.rb, so the base class is defined before its subclasses).
Dir.glob("web_stat/**/*.rb", base: __dir__).sort.each do |file|
  require File.join(__dir__, file)
end
5
+
6
module WebStat
  # Fetch the page at url and return its stat.
  # @param [String] url
  # @return the value of WebStat::FetchAsWeb#stat
  def self.stat_by_url(url)
    WebStat::FetchAsWeb.new(url).stat
  end

  # Build the stat from markup that is already in hand.
  # @param [String] html
  # @return the value of WebStat::FetchAsHtml#stat
  def self.stat_by_html(html)
    WebStat::FetchAsHtml.new(html).stat
  end
end