web_stat 0.2.11 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: bcfaeb202076ea30cae6205877d0b0ad9060eb84116b84ef7bd3580cc4349aac
4
- data.tar.gz: 98e586440c8f3aed29e38a003f6d1afc96b26b2ae4c21e43d47e1dfaf6aaa16a
3
+ metadata.gz: 746fe3666bc5a47c315e0499fd23dc84319142a8d850b872e5118553b1e711b6
4
+ data.tar.gz: d2335c2d0324fc81d5b891e7cf383646f8573ef0147ac4b34672add61ecdb6c6
5
5
  SHA512:
6
- metadata.gz: dfbe6264256f08550ebb42244d92bd81c976dd5dd72a6a4dabffdf6a1366a8f010e8a20527d0bbe7a8334cdb028e86ecae0c63d6cc4368741abe815f1fcb3092
7
- data.tar.gz: 9098d904f26dfdfe14c87352cb47ca3f0333f5424ef26d2cd8232c088c2cf8d7dcbeb07012a9294a5c4e4949b51fe71e94d0995465400d87cf7c037fd09ba978
6
+ metadata.gz: d23e332cb9114aec4c7302aafb1ed4ca6fc081b3b71a21586779ec5d67de74a196508c38dd70bb8c1e5af3828ced4bdeda7f7c86ea84336fbd6fc476ba87de31
7
+ data.tar.gz: 9c4e7955432b59b8f66edf71d814412e6eb7720c81f1c647e9f9420f7592febbcfe349c7ceb7a2b6d371c1b7d1de7d98148f7c015d5cc1f4ef52cf08a84afcbe
data/.gitignore CHANGED
@@ -6,6 +6,7 @@
6
6
  /pkg/
7
7
  /spec/reports/
8
8
  /tmp/
9
+ /.bundle
9
10
 
10
11
  # rspec failure tracking
11
12
  .rspec_status
@@ -1 +1 @@
1
- 2.7.0
1
+ 2.7.1
@@ -0,0 +1,25 @@
# Development image for the web_stat gem.
# The base image can be overridden at build time:
#   docker build --build-arg base_image=<image> .
ARG base_image="newsdict/rails:ubuntu20.10_nvmv0.35.2_nodev14.3.0_rubyv2.7.1_sasscv2.3.0_ffiv1.13.1_chromedriver"
FROM $base_image

# Locale, plus the nokogiri switch matching the
# `build.nokogiri --use-system-libraries` bundler setting below.
ENV LANG="C.UTF-8" \
    NOKOGIRI_USE_SYSTEM_LIBRARIES="YES"

# Application directory; WORKDIR creates it when it does not exist yet,
# so no separate `mkdir -p` step is needed.
WORKDIR /var/www/docker

# Copy the gem sources into the image.
COPY . .

# Skip documentation when installing gems.
# `--no-document` replaces `--no-rdoc --no-ri`, which RubyGems 3.x no longer
# accepts (the base image tag names Ruby 2.7.1).
RUN echo "gem: --no-document" > ~/.gemrc

# Configure bundler globally, then install the development and test groups.
RUN . /etc/profile.d/rvm.sh && \
    bundle config --global with 'development test' && \
    bundle config --global system true && \
    bundle config --global jobs 10 && \
    bundle config --global build.nokogiri --use-system-libraries && \
    bundle install

CMD ["bash"]
data/Gemfile CHANGED
@@ -1,6 +1,4 @@
1
1
  source "https://rubygems.org"
2
2
 
3
- gem "final_redirect_url", :git => "git@github.com:yubele/final_redirect_url"
4
-
5
3
  # Specify your gem's dependencies in web_stat.gemspec
6
4
  gemspec
@@ -1,13 +1,7 @@
1
- GIT
2
- remote: git@github.com:yubele/final_redirect_url
3
- revision: 45df878ec9495ebbfa06dc0a60cc5043c2519e16
4
- specs:
5
- final_redirect_url (0.1.1)
6
-
7
1
  PATH
8
2
  remote: .
9
3
  specs:
10
- web_stat (0.2.11)
4
+ web_stat (0.3.4)
11
5
  bundler (>= 2.0.2)
12
6
  cld (>= 0.8.0)
13
7
  mechanize (>= 2.7)
@@ -15,23 +9,26 @@ PATH
15
9
  nokogiri (>= 1.10.4)
16
10
  ruby-readability (>= 0.7)
17
11
  sanitize (>= 5.0.0)
12
+ selenium-webdriver (= 3.142.7)
18
13
 
19
14
  GEM
20
15
  remote: https://rubygems.org/
21
16
  specs:
22
17
  addressable (2.7.0)
23
18
  public_suffix (>= 2.0.2, < 5.0)
19
+ byebug (11.1.3)
20
+ childprocess (3.0.0)
24
21
  cld (0.8.0)
25
22
  ffi
26
- coderay (1.1.2)
27
- connection_pool (2.2.2)
23
+ coderay (1.1.3)
24
+ connection_pool (2.2.3)
28
25
  crack (0.4.3)
29
26
  safe_yaml (~> 1.0.0)
30
27
  crass (1.0.6)
31
28
  diff-lcs (1.3)
32
29
  domain_name (0.5.20190701)
33
30
  unf (>= 0.0.5, < 1.0.0)
34
- ffi (1.12.2)
31
+ ffi (1.13.1)
35
32
  guess_html_encoding (0.0.11)
36
33
  hashdiff (1.0.1)
37
34
  http-cookie (1.0.3)
@@ -48,7 +45,7 @@ GEM
48
45
  method_source (1.0.0)
49
46
  mime-types (3.3.1)
50
47
  mime-types-data (~> 3.2015)
51
- mime-types-data (3.2020.0425)
48
+ mime-types-data (3.2020.0512)
52
49
  mini_portile2 (2.4.0)
53
50
  natto (1.2.0)
54
51
  ffi (>= 1.9.0)
@@ -63,7 +60,10 @@ GEM
63
60
  pry (0.13.1)
64
61
  coderay (~> 1.1)
65
62
  method_source (~> 1.0)
66
- public_suffix (4.0.4)
63
+ pry-byebug (3.9.0)
64
+ byebug (~> 11.0)
65
+ pry (~> 0.13.0)
66
+ public_suffix (4.0.5)
67
67
  rake (13.0.1)
68
68
  rspec (3.9.0)
69
69
  rspec-core (~> 3.9.0)
@@ -71,7 +71,7 @@ GEM
71
71
  rspec-mocks (~> 3.9.0)
72
72
  rspec-core (3.9.2)
73
73
  rspec-support (~> 3.9.3)
74
- rspec-expectations (3.9.1)
74
+ rspec-expectations (3.9.2)
75
75
  diff-lcs (>= 1.2.0, < 2.0)
76
76
  rspec-support (~> 3.9.0)
77
77
  rspec-mocks (3.9.1)
@@ -81,11 +81,15 @@ GEM
81
81
  ruby-readability (0.7.0)
82
82
  guess_html_encoding (>= 0.0.4)
83
83
  nokogiri (>= 1.6.0)
84
+ rubyzip (2.3.0)
84
85
  safe_yaml (1.0.5)
85
- sanitize (5.1.0)
86
+ sanitize (5.2.0)
86
87
  crass (~> 1.0.2)
87
88
  nokogiri (>= 1.8.0)
88
89
  nokogumbo (~> 2.0)
90
+ selenium-webdriver (3.142.7)
91
+ childprocess (>= 0.5, < 4.0)
92
+ rubyzip (>= 1.2.2)
89
93
  unf (0.1.4)
90
94
  unf_ext
91
95
  unf_ext (0.0.7.7)
@@ -99,8 +103,8 @@ PLATFORMS
99
103
  ruby
100
104
 
101
105
  DEPENDENCIES
102
- final_redirect_url!
103
- pry (>= 0.12.2)
106
+ pry (>= 0.13.1)
107
+ pry-byebug (= 3.9.0)
104
108
  rake (>= 10.0)
105
109
  rspec (>= 3.0)
106
110
  web_stat!
@@ -0,0 +1,18 @@
---
# Compose definition for the web_stat development container.
version: '3.8'

networks:
  app-tier:
    driver: bridge

services:
  web_stat:
    container_name: web_stat
    build:
      context: .
      dockerfile: Dockerfile
    # Mount the working tree at the container's working directory.
    volumes:
      - ./:/var/www/docker:cached
    working_dir: /var/www/docker
    # Keep an interactive bash session open as the service's main process.
    command: bash
    tty: true
    stdin_open: true
    networks:
      - app-tier
@@ -0,0 +1,8 @@
#!/usr/bin/env bash
# Run a command inside the `web_stat` compose service with rvm and nvm loaded.
# Usage: docker/exec <command> [args...]

# Add -T to `docker-compose exec` when stdin is not a terminal
# (`tty` exits non-zero in that case).
compose_exec=(docker-compose exec)
if ! tty >/dev/null; then
  compose_exec+=(-T)
fi

# All arguments are joined into one string and evaluated by bash in the container.
"${compose_exec[@]}" web_stat /bin/bash -c ". /etc/profile.d/rvm.sh && . /root/.nvm/nvm.sh && $*"
@@ -0,0 +1,13 @@
#!/usr/bin/env bash
# Restart the compose services.
#
# Usage:
#   docker/start [rm] [docker-compose up args...]   # run `up` in the foreground
#   docker/start [rm] attach <container>            # run `up -d`, then attach
set -e

# Optional leading "rm": remove service containers before restarting.
# NOTE(review): this runs before `stop`, so it presumably only affects
# containers that are already stopped - confirm the intended order.
if [ "$1" = "rm" ]; then
  shift
  docker-compose rm -f
fi

docker-compose stop

if [ "$1" = "attach" ]; then
  # Fail before starting anything when the container name is missing.
  if [ -z "$2" ]; then
    echo "usage: $0 [rm] attach <container>" >&2
    exit 1
  fi
  docker-compose up -d
  docker attach "$2"
else
  # Quoted so arguments containing spaces or globs reach docker-compose intact.
  docker-compose up "$@"
fi
@@ -0,0 +1,30 @@
module WebStat
  # Helper that drives headless Chrome through Selenium.
  class WebDriverHelper
    class << self
      # Open +url+ in headless Chrome and return the URL the browser reports
      # after the page has loaded.
      #
      # Selenium logging is (re)configured on every call: level :info,
      # output to /tmp/selenium.log.
      #
      # @param [String, URI] url page to open (converted with #to_s)
      # @param [Integer] delay seconds to sleep after loading, before the URL
      #   is read; ignored unless it is an Integer
      # @return [String] the browser's current URL
      def get_last_url(url, delay=nil)
        Selenium::WebDriver.logger.output = File.join("/tmp", "selenium.log")
        Selenium::WebDriver.logger.level = :info
        options = Selenium::WebDriver::Chrome::Options.new(args: [
          'headless',
          'no-sandbox',
          'disable-gpu',
          'start-maximized',
          'window-size=1920,1080'
        ])
        driver = Selenium::WebDriver.for(:chrome, options: options)
        begin
          driver.manage.timeouts.implicit_wait = 10
          driver.get(url.to_s)
          sleep delay if delay.is_a?(Integer)
          driver.current_url
        ensure
          # Always shut the browser down, even when navigation raises;
          # otherwise the Chrome/chromedriver processes are left running.
          driver.quit
        end
      end
    end
  end
end
@@ -1,14 +1,18 @@
1
1
  require "bundler"
2
2
 
3
+ require 'cld'
3
4
  require 'uri'
4
5
  require 'digest'
6
+ require 'logger'
5
7
  require 'sanitize'
6
8
  require 'nokogiri'
7
9
  require 'open-uri'
10
+ require 'net/http'
8
11
  require 'ruby-readability'
9
- require 'final_redirect_url'
10
- require 'cld'
12
+ require 'selenium-webdriver'
11
13
 
14
+ require "helpers/web_drive_helper"
15
+ require "web_stat/final_redirect_url"
12
16
  require "web_stat/categorize"
13
17
  require "web_stat/configure"
14
18
  require "web_stat/errors"
@@ -1,14 +1,21 @@
1
- # Minimum number of characters to detect meta title
2
- min_length_of_meta_title: 10
3
- # Split regular expression for titles
4
- regex_to_sprit_title: '\||-|:|||:|〜|\~| '
5
- # User Agent
6
- user_agent: "web_stat gem agent"
7
- # Eyecatch image xpaths
8
- eyecatch_image_xpaths:
9
- - '/html/head/meta[@property="twitter:image"]/@content'
10
- - '/html/head/meta[@property="og:image"]/@content'
11
- - '//img[@class="attachment-post-thumbnail"]/@src'
12
- - '//div[@id="content"]//img/@src'
13
- - '//img/@src'
14
- userdic: ""
---
# Settings are grouped per environment. WebStat::Configure.get returns the
# section named by Rails.env, or "production" when Rails is not defined.

# Base settings; `test` and `production` merge them in through the anchor.
development: &development
  # Minimum number of characters to detect meta title
  min_length_of_meta_title: 10
  # Regular expression used to split titles
  regex_to_sprit_title: '\||-|:|||:|〜|\~| – '
  # User agent string given to the Mechanize agent
  user_agent: 'web_stat gem agent'
  # XPath expressions for the eyecatch image
  eyecatch_image_xpaths:
    - '/html/head/meta[@property="twitter:image"]/@content'
    - '/html/head/meta[@property="og:image"]/@content'
    - '//img[@class="attachment-post-thumbnail"]/@src'
    - '//div[@id="content"]//img/@src'
    - '//img/@src'
  # NOTE(review): presumably the MeCab user dictionary path - confirm
  userdic: ''
  # When true, URLs answering 200 are additionally resolved through
  # chromedriver. The key spelling is the one the code reads - do not "fix" it.
  use_chromedirver: false

test:
  <<: *development

production:
  <<: *development
  use_chromedirver: true
@@ -3,31 +3,37 @@ module WebStat
3
3
  class Configure
4
4
  DEFAULT_CONFIG_FILE_PATH = 'config/web_stat.yml'
5
5
 
6
- # Get yaml
7
- def self.get
8
- YAML.load_file(self.get_configure_path)
9
- end
10
-
11
- # Get configure path
12
- def self.get_configure_path
13
- if File.exists?(self.get_custom_configure_path)
14
- self.get_custom_configure_path
15
- else
16
- self.get_default_configure_path
6
+ class << self
7
+ # Get yaml
8
+ def get
9
+ if defined? Rails
10
+ YAML.load_file(get_configure_path)[Rails.env]
11
+ else
12
+ YAML.load_file(get_configure_path)["production"]
13
+ end
17
14
  end
18
- end
19
-
20
- # Get default configure path
21
- def self.get_default_configure_path
22
- File.join(File.expand_path("../", __FILE__), DEFAULT_CONFIG_FILE_PATH)
23
- end
24
-
25
- # Get custom configure path
26
- def self.get_custom_configure_path
27
- if defined? Rails
28
- File.join(Rails.root, DEFAULT_CONFIG_FILE_PATH)
29
- else
30
- File.join(Bundler.root, DEFAULT_CONFIG_FILE_PATH)
15
+
16
+ # Get configure path
17
+ def get_configure_path
18
+ if File.exists?(get_custom_configure_path)
19
+ get_custom_configure_path
20
+ else
21
+ get_default_configure_path
22
+ end
23
+ end
24
+
25
+ # Get default configure path
26
+ def get_default_configure_path
27
+ File.join(File.expand_path("../", __FILE__), DEFAULT_CONFIG_FILE_PATH)
28
+ end
29
+
30
+ # Get custom configure path
31
+ def get_custom_configure_path
32
+ if defined? Rails
33
+ File.join(Rails.root, DEFAULT_CONFIG_FILE_PATH)
34
+ else
35
+ File.join(Bundler.root, DEFAULT_CONFIG_FILE_PATH)
36
+ end
31
37
  end
32
38
  end
33
39
  end
@@ -19,7 +19,6 @@ module WebStat
19
19
  title.strip
20
20
  end
21
21
  end
22
-
23
22
  # Get name of domain
24
23
  def site_name
25
24
  begin
@@ -33,7 +32,6 @@ module WebStat
33
32
  site_name.strip
34
33
  end
35
34
  end
36
- []
37
35
  # Get main section
38
36
  def content
39
37
  Sanitize.clean(Readability::Document.new(@nokogiri.at('body')).content)
@@ -59,7 +57,7 @@ module WebStat
59
57
  # Get local path to save url
60
58
  # @param [String] url
61
59
  def save_local_path(url)
62
- return nil if url.nil?
60
+ return nil if url.nil? || url.empty?
63
61
  tmp_file = "/tmp/#{Digest::SHA1.hexdigest(url)}"
64
62
  agent = Mechanize.new { |_agent| _agent.user_agent = WebStat::Configure.get["user_agent"] }
65
63
  image = agent.get(url)
@@ -124,7 +122,7 @@ module WebStat
124
122
  # Get original url
125
123
  # @param [String] url
126
124
  def original_url(url)
127
- last_url = FinalRedirectUrl.final_redirect_url(url)
125
+ last_url = WebStat::FinalRedirectUrl.final_redirect_url(url)
128
126
  unless last_url.nil? || last_url.scrub('').empty?
129
127
  last_url
130
128
  else
@@ -0,0 +1,54 @@
# Adapted from https://github.com/indyarocks/final_redirect_url
module WebStat
  # Follows HTTP redirects to find the URL a request finally lands on.
  class FinalRedirectUrl
    class << self
      # Resolve the final URL reached from +url+.
      #
      # @param [String] url http(s) URL to start from
      # @param [Hash] options
      #   :depth maximum number of redirects to follow (default 10; also used
      #   when the given value is not a positive integer)
      # @return [String] the final URL, or '' when +url+ is not an http(s)
      #   URL, the chain ends on a non-200 response without a Location
      #   header, or the lookup raises a StandardError
      def final_redirect_url(url, options={})
        return '' unless is_valid_url?(url)

        depth = options[:depth].to_i
        depth = 10 unless depth > 0
        begin
          response_uri = get_final_redirect_url(url, depth)
          response_uri.nil? ? '' : url_string_from_uri(response_uri)
        rescue StandardError
          # Best effort: network, parsing and browser errors all yield ''.
          # Interrupt/SystemExit are deliberately not rescued.
          ''
        end
      end

      private

      # True when +url+ (converted with #to_s) contains an http or https URL.
      def is_valid_url?(url)
        url.to_s.match? URI::regexp(['http', 'https'])
      end

      # Request +url+ and follow Location headers, at most +limit+ times.
      #
      # @return [URI, nil] the last URI reached, or nil when a non-200
      #   response carries no Location header
      def get_final_redirect_url(url, limit = 10)
        uri = URI.parse(url)
        # Depth exhausted: report the last URL reached, as a URI like every
        # other return path.
        return uri if limit <= 0

        response = ::Net::HTTP.get_response(uri)
        if response.is_a?(Net::HTTPOK)
          # `uri` is already parsed; return it directly.
          return uri unless WebStat::Configure.get["use_chromedirver"]
          return URI.parse(WebStat::WebDriverHelper.get_last_url(uri.to_s))
        end

        redirect_location = response['location']
        return nil if redirect_location.nil? || redirect_location.empty?

        # Resolve relative Location values against the current URL so that
        # port and base path are kept.
        location_uri = URI.parse(redirect_location)
        redirect_location = uri.merge(location_uri).to_s if location_uri.relative?
        warn "redirected to #{redirect_location}"
        get_final_redirect_url(redirect_location, limit - 1)
      end

      # Build "scheme://host[:port]path[?query][#fragment]" from +uri+.
      # The port is included only when it differs from the scheme's default.
      def url_string_from_uri(uri)
        authority = "#{uri.host}"
        authority += ":#{uri.port}" unless uri.port == uri.default_port
        url_str = "#{uri.scheme}://#{authority}#{uri.request_uri}"
        url_str += "##{uri.fragment}" if uri.fragment
        url_str
      end
    end
  end
end
@@ -4,7 +4,7 @@ module WebStat
4
4
  attr_accessor :natto_mecab, :article
5
5
 
6
6
  def initialize(article, userdic: nil)
7
- @natto_mecab = Natto::MeCab.new(userdic: userdic)
7
+ @natto_mecab = Natto::MeCab.new(userdic: userdic)
8
8
  @article = article
9
9
  end
10
10
 
@@ -1,3 +1,3 @@
1
1
  module WebStat
2
- VERSION = "0.2.11"
2
+ VERSION = "0.3.4"
3
3
  end
@@ -1,12 +1,18 @@
1
1
  require 'rspec/expectations'
2
2
  require "bundler/setup"
3
3
  require 'pry'
4
+ require 'pry-byebug'
4
5
  require "web_stat"
5
6
 
6
7
  require 'webmock'
7
8
  include WebMock::API
8
9
  WebMock.enable!
9
10
 
11
+ WebMock.disable_net_connect!({
12
+ allow_localhost: true,
13
+ allow: 'chromedriver.storage.googleapis.com'
14
+ })
15
+
10
16
  RSpec.configure do |config|
11
17
  # Enable flags like --only-failures and --next-failure
12
18
  config.example_status_persistence_file_path = ".rspec_status"
@@ -27,9 +27,11 @@ Gem::Specification.new do |spec|
27
27
  spec.add_runtime_dependency "natto", ">= 1.1.2"
28
28
  spec.add_runtime_dependency "sanitize", ">= 5.0.0"
29
29
  spec.add_runtime_dependency "cld", ">= 0.8.0"
30
+ spec.add_runtime_dependency "selenium-webdriver", "= 3.142.7"
30
31
 
31
32
  spec.add_development_dependency "rake", ">= 10.0"
32
33
  spec.add_development_dependency "rspec", ">= 3.0"
33
- spec.add_development_dependency "pry", ">= 0.12.2"
34
+ spec.add_development_dependency "pry", ">= 0.13.1"
34
35
  spec.add_development_dependency "webmock", ">= 3.6.0"
36
+ spec.add_development_dependency "pry-byebug", "3.9.0"
35
37
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: web_stat
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.11
4
+ version: 0.3.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - yusuke abe
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-06-09 00:00:00.000000000 Z
11
+ date: 2020-06-13 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -108,6 +108,20 @@ dependencies:
108
108
  - - ">="
109
109
  - !ruby/object:Gem::Version
110
110
  version: 0.8.0
111
+ - !ruby/object:Gem::Dependency
112
+ name: selenium-webdriver
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - '='
116
+ - !ruby/object:Gem::Version
117
+ version: 3.142.7
118
+ type: :runtime
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - '='
123
+ - !ruby/object:Gem::Version
124
+ version: 3.142.7
111
125
  - !ruby/object:Gem::Dependency
112
126
  name: rake
113
127
  requirement: !ruby/object:Gem::Requirement
@@ -142,14 +156,14 @@ dependencies:
142
156
  requirements:
143
157
  - - ">="
144
158
  - !ruby/object:Gem::Version
145
- version: 0.12.2
159
+ version: 0.13.1
146
160
  type: :development
147
161
  prerelease: false
148
162
  version_requirements: !ruby/object:Gem::Requirement
149
163
  requirements:
150
164
  - - ">="
151
165
  - !ruby/object:Gem::Version
152
- version: 0.12.2
166
+ version: 0.13.1
153
167
  - !ruby/object:Gem::Dependency
154
168
  name: webmock
155
169
  requirement: !ruby/object:Gem::Requirement
@@ -164,6 +178,20 @@ dependencies:
164
178
  - - ">="
165
179
  - !ruby/object:Gem::Version
166
180
  version: 3.6.0
181
+ - !ruby/object:Gem::Dependency
182
+ name: pry-byebug
183
+ requirement: !ruby/object:Gem::Requirement
184
+ requirements:
185
+ - - '='
186
+ - !ruby/object:Gem::Version
187
+ version: 3.9.0
188
+ type: :development
189
+ prerelease: false
190
+ version_requirements: !ruby/object:Gem::Requirement
191
+ requirements:
192
+ - - '='
193
+ - !ruby/object:Gem::Version
194
+ version: 3.9.0
167
195
  description: Fetch the web pages and stat.
168
196
  email:
169
197
  - yube@newsdict.jp
@@ -177,12 +205,17 @@ files:
177
205
  - ".ruby-version"
178
206
  - ".travis.yml"
179
207
  - CODE_OF_CONDUCT.md
208
+ - Dockerfile
180
209
  - Gemfile
181
210
  - Gemfile.lock
182
211
  - LICENSE.txt
183
212
  - README.md
184
213
  - Rakefile
185
214
  - bin/fetch_as_html
215
+ - docker-compose.yml
216
+ - docker/exec
217
+ - docker/start
218
+ - lib/helpers/web_drive_helper.rb
186
219
  - lib/web_stat.rb
187
220
  - lib/web_stat/categorize.rb
188
221
  - lib/web_stat/config/web_stat.yml
@@ -191,6 +224,7 @@ files:
191
224
  - lib/web_stat/fetch.rb
192
225
  - lib/web_stat/fetch/fetch_as_html.rb
193
226
  - lib/web_stat/fetch/fetch_as_web.rb
227
+ - lib/web_stat/final_redirect_url.rb
194
228
  - lib/web_stat/tag.rb
195
229
  - lib/web_stat/tasks/install.rake
196
230
  - lib/web_stat/version.rb
@@ -224,7 +258,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
224
258
  - !ruby/object:Gem::Version
225
259
  version: '0'
226
260
  requirements: []
227
- rubygems_version: 3.0.3
261
+ rubygems_version: 3.1.2
228
262
  signing_key:
229
263
  specification_version: 4
230
264
  summary: Get the status of the web pages.