web_stat 0.2.10 → 0.3.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 5821645b40a23008360e4e9717ec9f420fdd5adc5a7b4fdfea6d08de111af27f
4
- data.tar.gz: c78ddd43e9ad8691dd05481e2854e4280bfcda857cb3fa97624c947c6233af75
3
+ metadata.gz: d27f9de72e744c0cff9c1876952b23d7dd86a3d8a8331ee95293ff06d0ad16a9
4
+ data.tar.gz: a9985bd6c7167e70bcffb0f215ce13afbdef85d34790a0a5486b72b7380b01fb
5
5
  SHA512:
6
- metadata.gz: 439ad9a5e969057996d802374739bb587648d6fbc27a9b2fe0da71fdc36345c87b2b483fe474f8d499769d107cab6c335cf41f5575363b2a25edc2c1f88f7cd2
7
- data.tar.gz: 0e6942690a71294c2402111e21605715d3191bae076167febf2df7277d50da3c8ac2682ef319e570b7f3763ce9515cb7410243f901fb9d3d6a5f86bb67177537
6
+ metadata.gz: 2f2a7d199c6cd737bb6b6facd97fd94324898ccb14554e2497d44d567f19c06deb127da3db5be968737d8dffde25b0b498845ebb60661408810891a3c4eea142
7
+ data.tar.gz: 0c1b425aa39397ac13e68009b20c0015d7939ce4af1faabbb2716b22e2cb2c4776aaecd59ab6328113a270c3a4d545f9d2d2d1472373e3f53d083cbcd3833186
data/.gitignore CHANGED
@@ -6,6 +6,7 @@
6
6
  /pkg/
7
7
  /spec/reports/
8
8
  /tmp/
9
+ /.bundle
9
10
 
10
11
  # rspec failure tracking
11
12
  .rspec_status
@@ -1 +1 @@
1
- 2.7.0
1
+ 2.7.1
@@ -0,0 +1,25 @@
1
+ # Define the base image; override it with --build-arg base_image=<image>
2
+ ARG base_image="newsdict/rails:ubuntu20.10_nvmv0.35.2_nodev14.3.0_rubyv2.7.1_sasscv2.3.0_ffiv1.13.1_chromedriver"
3
+ FROM $base_image
4
+
5
+ # Set locale
6
+ ENV LANG "C.UTF-8"
7
+ ENV NOKOGIRI_USE_SYSTEM_LIBRARIES "YES"
8
+
9
+ # Create the application directory and make it the working directory.
10
+ RUN mkdir -p /var/www/docker
11
+ WORKDIR /var/www/docker
12
+
13
+ # Set up application
14
+ COPY . .
15
+
16
+ # Init gems
17
+ RUN echo "gem: --no-rdoc --no-ri" > ~/.gemrc
18
+ RUN . /etc/profile.d/rvm.sh && \
19
+ bundle config --global with 'development test' && \
20
+ bundle config --global system true && \
21
+ bundle config --global jobs 10 && \
22
+ bundle config --global build.nokogiri --use-system-libraries && \
23
+ bundle install
24
+
25
+ CMD ["bash"]
data/Gemfile CHANGED
@@ -1,6 +1,4 @@
1
1
  source "https://rubygems.org"
2
2
 
3
- gem "final_redirect_url", :git => "git@github.com:yubele/final_redirect_url"
4
-
5
3
  # Specify your gem's dependencies in web_stat.gemspec
6
4
  gemspec
@@ -1,13 +1,7 @@
1
- GIT
2
- remote: git@github.com:yubele/final_redirect_url
3
- revision: 45df878ec9495ebbfa06dc0a60cc5043c2519e16
4
- specs:
5
- final_redirect_url (0.1.1)
6
-
7
1
  PATH
8
2
  remote: .
9
3
  specs:
10
- web_stat (0.2.10)
4
+ web_stat (0.3.3)
11
5
  bundler (>= 2.0.2)
12
6
  cld (>= 0.8.0)
13
7
  mechanize (>= 2.7)
@@ -15,23 +9,26 @@ PATH
15
9
  nokogiri (>= 1.10.4)
16
10
  ruby-readability (>= 0.7)
17
11
  sanitize (>= 5.0.0)
12
+ selenium-webdriver (= 3.142.7)
18
13
 
19
14
  GEM
20
15
  remote: https://rubygems.org/
21
16
  specs:
22
17
  addressable (2.7.0)
23
18
  public_suffix (>= 2.0.2, < 5.0)
19
+ byebug (11.1.3)
20
+ childprocess (3.0.0)
24
21
  cld (0.8.0)
25
22
  ffi
26
- coderay (1.1.2)
27
- connection_pool (2.2.2)
23
+ coderay (1.1.3)
24
+ connection_pool (2.2.3)
28
25
  crack (0.4.3)
29
26
  safe_yaml (~> 1.0.0)
30
27
  crass (1.0.6)
31
28
  diff-lcs (1.3)
32
29
  domain_name (0.5.20190701)
33
30
  unf (>= 0.0.5, < 1.0.0)
34
- ffi (1.12.2)
31
+ ffi (1.13.1)
35
32
  guess_html_encoding (0.0.11)
36
33
  hashdiff (1.0.1)
37
34
  http-cookie (1.0.3)
@@ -48,7 +45,7 @@ GEM
48
45
  method_source (1.0.0)
49
46
  mime-types (3.3.1)
50
47
  mime-types-data (~> 3.2015)
51
- mime-types-data (3.2020.0425)
48
+ mime-types-data (3.2020.0512)
52
49
  mini_portile2 (2.4.0)
53
50
  natto (1.2.0)
54
51
  ffi (>= 1.9.0)
@@ -63,7 +60,10 @@ GEM
63
60
  pry (0.13.1)
64
61
  coderay (~> 1.1)
65
62
  method_source (~> 1.0)
66
- public_suffix (4.0.4)
63
+ pry-byebug (3.9.0)
64
+ byebug (~> 11.0)
65
+ pry (~> 0.13.0)
66
+ public_suffix (4.0.5)
67
67
  rake (13.0.1)
68
68
  rspec (3.9.0)
69
69
  rspec-core (~> 3.9.0)
@@ -71,7 +71,7 @@ GEM
71
71
  rspec-mocks (~> 3.9.0)
72
72
  rspec-core (3.9.2)
73
73
  rspec-support (~> 3.9.3)
74
- rspec-expectations (3.9.1)
74
+ rspec-expectations (3.9.2)
75
75
  diff-lcs (>= 1.2.0, < 2.0)
76
76
  rspec-support (~> 3.9.0)
77
77
  rspec-mocks (3.9.1)
@@ -81,11 +81,15 @@ GEM
81
81
  ruby-readability (0.7.0)
82
82
  guess_html_encoding (>= 0.0.4)
83
83
  nokogiri (>= 1.6.0)
84
+ rubyzip (2.3.0)
84
85
  safe_yaml (1.0.5)
85
- sanitize (5.1.0)
86
+ sanitize (5.2.0)
86
87
  crass (~> 1.0.2)
87
88
  nokogiri (>= 1.8.0)
88
89
  nokogumbo (~> 2.0)
90
+ selenium-webdriver (3.142.7)
91
+ childprocess (>= 0.5, < 4.0)
92
+ rubyzip (>= 1.2.2)
89
93
  unf (0.1.4)
90
94
  unf_ext
91
95
  unf_ext (0.0.7.7)
@@ -99,8 +103,8 @@ PLATFORMS
99
103
  ruby
100
104
 
101
105
  DEPENDENCIES
102
- final_redirect_url!
103
- pry (>= 0.12.2)
106
+ pry (>= 0.13.1)
107
+ pry-byebug (= 3.9.0)
104
108
  rake (>= 10.0)
105
109
  rspec (>= 3.0)
106
110
  web_stat!
@@ -0,0 +1,18 @@
1
+ version: "3.8"
2
+ networks:
3
+ app-tier:
4
+ driver: bridge
5
+ services:
6
+ web_stat:
7
+ tty: true
8
+ stdin_open: true
9
+ container_name: web_stat
10
+ build:
11
+ context: .
12
+ dockerfile: Dockerfile
13
+ volumes:
14
+ - ./:/var/www/docker:cached
15
+ working_dir: /var/www/docker
16
+ command: bash
17
+ networks:
18
+ - app-tier
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ # Detect whether a TTY is available; without one, docker-compose exec is run with -T.
3
+ if [ "$(tty>/dev/null;echo $?)" != "0" ];then
4
+ DOCKERCOMPOSE_EXEC="docker-compose exec -T"
5
+ else
6
+ DOCKERCOMPOSE_EXEC="docker-compose exec"
7
+ fi
8
+ $DOCKERCOMPOSE_EXEC web_stat /bin/bash -c ". /etc/profile.d/rvm.sh && . /root/.nvm/nvm.sh && $*"
@@ -0,0 +1,13 @@
1
+ #!/usr/bin/env bash
2
+ set -e
3
+ if [ "$1" = "rm" ];then
4
+ shift
5
+ docker-compose rm -f
6
+ fi
7
+ docker-compose stop
8
+ if [ "$1" = "attach" ]; then
9
+ docker-compose up -d
10
+ docker attach $2
11
+ else
12
+ docker-compose up $@
13
+ fi
@@ -0,0 +1,30 @@
1
+ module WebStat
2
+ class WebDriverHelper
3
+ class << self
4
+ # Load the URL in headless Chrome and return the URL the browser ends up on
5
+ # @param [String] url
6
+ # @param [Integer] delay optional number of seconds to sleep after loading, before the current URL is read
7
+ def get_last_url(url, delay=nil)
8
+ Selenium::WebDriver.logger.output = File.join("/tmp", "selenium.log")
9
+ Selenium::WebDriver.logger.level = :info
10
+ options = Selenium::WebDriver::Chrome::Options.new(args: [
11
+ 'headless',
12
+ 'no-sandbox',
13
+ 'disable-gpu',
14
+ 'start-maximized',
15
+ 'window-size=1920,1080'
16
+ ])
17
+ driver = Selenium::WebDriver.for(:chrome, options: options)
18
+ driver.manage.timeouts.implicit_wait = 10
19
+ Selenium::WebDriver::Wait.new(timeout: 10)
20
+ driver.get(url)
21
+ if delay.is_a?(Integer)
22
+ sleep delay
23
+ end
24
+ last_url = driver.current_url
25
+ driver.quit
26
+ last_url
27
+ end
28
+ end
29
+ end
30
+ end
@@ -1,14 +1,18 @@
1
1
  require "bundler"
2
2
 
3
+ require 'cld'
3
4
  require 'uri'
4
5
  require 'digest'
6
+ require 'logger'
5
7
  require 'sanitize'
6
8
  require 'nokogiri'
7
9
  require 'open-uri'
10
+ require 'net/http'
8
11
  require 'ruby-readability'
9
- require 'final_redirect_url'
10
- require 'cld'
12
+ require 'selenium-webdriver'
11
13
 
14
+ require "helpers/web_drive_helper"
15
+ require "web_stat/final_redirect_url"
12
16
  require "web_stat/categorize"
13
17
  require "web_stat/configure"
14
18
  require "web_stat/errors"
@@ -1,14 +1,21 @@
1
- # Minimum number of characters to detect meta title
2
- min_length_of_meta_title: 10
3
- # Split regular expression for titles
4
- regex_to_sprit_title: '\||-|:|||:|〜|\~| '
5
- # User Agent
6
- user_agent: "web_stat gem agent"
7
- # Eyecatch image xpaths
8
- eyecatch_image_xpaths:
9
- - '/html/head/meta[@property="twitter:image"]/@content'
10
- - '/html/head/meta[@property="og:image"]/@content'
11
- - '//img[@class="attachment-post-thumbnail"]/@src'
12
- - '//div[@id="content"]//img/@src'
13
- - '//img/@src'
14
- userdic: ""
1
+ development: &development
2
+ # Minimum number of characters to detect meta title
3
+ min_length_of_meta_title: 10
4
+ # Regular expression used to split titles
5
+ regex_to_sprit_title: '\||-|:|||:|〜|\~| – '
6
+ # User Agent
7
+ user_agent: "web_stat gem agent"
8
+ # Eyecatch image xpaths
9
+ eyecatch_image_xpaths:
10
+ - '/html/head/meta[@property="twitter:image"]/@content'
11
+ - '/html/head/meta[@property="og:image"]/@content'
12
+ - '//img[@class="attachment-post-thumbnail"]/@src'
13
+ - '//div[@id="content"]//img/@src'
14
+ - '//img/@src'
15
+ userdic: ""
16
+ use_chromedirver: false
17
+ test:
18
+ <<: *development
19
+ production:
20
+ <<: *development
21
+ use_chromedirver: true
@@ -3,31 +3,37 @@ module WebStat
3
3
  class Configure
4
4
  DEFAULT_CONFIG_FILE_PATH = 'config/web_stat.yml'
5
5
 
6
- # Get yaml
7
- def self.get
8
- YAML.load_file(self.get_configure_path)
9
- end
10
-
11
- # Get configure path
12
- def self.get_configure_path
13
- if File.exists?(self.get_custom_configure_path)
14
- self.get_custom_configure_path
15
- else
16
- self.get_default_configure_path
6
+ class << self
7
+ # Get yaml
8
+ def get
9
+ if defined? Rails
10
+ YAML.load_file(get_configure_path)[Rails.env]
11
+ else
12
+ YAML.load_file(get_configure_path)["production"]
13
+ end
17
14
  end
18
- end
19
-
20
- # Get default configure path
21
- def self.get_default_configure_path
22
- File.join(File.expand_path("../", __FILE__), DEFAULT_CONFIG_FILE_PATH)
23
- end
24
-
25
- # Get custom configure path
26
- def self.get_custom_configure_path
27
- if defined? Rails
28
- File.join(Rails.root, DEFAULT_CONFIG_FILE_PATH)
29
- else
30
- File.join(Bundler.root, DEFAULT_CONFIG_FILE_PATH)
15
+
16
+ # Get configure path
17
+ def get_configure_path
18
+ if File.exists?(get_custom_configure_path)
19
+ get_custom_configure_path
20
+ else
21
+ get_default_configure_path
22
+ end
23
+ end
24
+
25
+ # Get default configure path
26
+ def get_default_configure_path
27
+ File.join(File.expand_path("../", __FILE__), DEFAULT_CONFIG_FILE_PATH)
28
+ end
29
+
30
+ # Get custom configure path
31
+ def get_custom_configure_path
32
+ if defined? Rails
33
+ File.join(Rails.root, DEFAULT_CONFIG_FILE_PATH)
34
+ else
35
+ File.join(Bundler.root, DEFAULT_CONFIG_FILE_PATH)
36
+ end
31
37
  end
32
38
  end
33
39
  end
@@ -13,9 +13,12 @@ module WebStat
13
13
  rescue
14
14
  title = @nokogiri.title
15
15
  end
16
- title.strip
16
+ if title.nil?
17
+ "No Title"
18
+ else
19
+ title.strip
20
+ end
17
21
  end
18
-
19
22
  # Get name of domain
20
23
  def site_name
21
24
  begin
@@ -23,9 +26,12 @@ module WebStat
23
26
  rescue
24
27
  site_name = @nokogiri.title
25
28
  end
26
- site_name.strip
29
+ if site_name.nil?
30
+ "No Sitename"
31
+ else
32
+ site_name.strip
33
+ end
27
34
  end
28
- []
29
35
  # Get main section
30
36
  def content
31
37
  Sanitize.clean(Readability::Document.new(@nokogiri.at('body')).content)
@@ -116,7 +122,7 @@ module WebStat
116
122
  # Get original url
117
123
  # @param [String] url
118
124
  def original_url(url)
119
- last_url = FinalRedirectUrl.final_redirect_url(url)
125
+ last_url = WebStat::FinalRedirectUrl.final_redirect_url(url)
120
126
  unless last_url.nil? || last_url.scrub('').empty?
121
127
  last_url
122
128
  else
@@ -0,0 +1,54 @@
1
+ # ref) https://github.com/indyarocks/final_redirect_url
2
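+ # Customized for web_stat:
3
+ # changed so that a 200 response can optionally be resolved through chromedriver (use_chromedirver setting).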
4
+ module WebStat
5
+ class FinalRedirectUrl
6
+ class << self
7
+ def final_redirect_url(url, options={})
8
+ final_url = ''
9
+ if is_valid_url?(url)
10
+ begin
11
+ redirect_lookup_depth = options[:depth].to_i > 0 ? options[:depth].to_i : 10
12
+ response_uri = get_final_redirect_url(url, redirect_lookup_depth)
13
+ final_url = url_string_from_uri(response_uri)
14
+ rescue Exception => ex
15
+ # nothing
16
+ end
17
+ end
18
+ final_url
19
+ end
20
+
21
+ private
22
+ def is_valid_url?(url)
23
+ url.to_s.match? URI::regexp(['http', 'https'])
24
+ end
25
+ def get_final_redirect_url(url, limit = 10)
26
+ return url if limit <= 0
27
+ uri = URI.parse(url)
28
+ response = ::Net::HTTP.get_response(uri)
29
+ if response.class == Net::HTTPOK
30
+ if WebStat::Configure.get["use_chromedirver"]
31
+ return URI.parse(WebStat::WebDriverHelper.get_last_url(uri))
32
+ else
33
+ return URI.parse(uri)
34
+ end
35
+ else
36
+ redirect_location = response['location']
37
+ location_uri = URI.parse(redirect_location)
38
+ if location_uri.host.nil?
39
+ redirect_location = uri.scheme + '://' + uri.host + redirect_location
40
+ end
41
+ warn "redirected to #{redirect_location}"
42
+ get_final_redirect_url(redirect_location, limit - 1)
43
+ end
44
+ end
45
+ def url_string_from_uri(uri)
46
+ url_str = "#{uri.scheme}://#{uri.host}#{uri.request_uri}"
47
+ if uri.fragment
48
+ url_str = url_str + "##{uri.fragment}"
49
+ end
50
+ url_str
51
+ end
52
+ end
53
+ end
54
+ end
@@ -4,7 +4,7 @@ module WebStat
4
4
  attr_accessor :natto_mecab, :article
5
5
 
6
6
  def initialize(article, userdic: nil)
7
- @natto_mecab = Natto::MeCab.new(userdic: userdic)
7
+ @natto_mecab = Natto::MeCab.new(userdic: userdic)
8
8
  @article = article
9
9
  end
10
10
 
@@ -1,3 +1,3 @@
1
1
  module WebStat
2
- VERSION = "0.2.10"
2
+ VERSION = "0.3.3"
3
3
  end
@@ -1,12 +1,18 @@
1
1
  require 'rspec/expectations'
2
2
  require "bundler/setup"
3
3
  require 'pry'
4
+ require 'pry-byebug'
4
5
  require "web_stat"
5
6
 
6
7
  require 'webmock'
7
8
  include WebMock::API
8
9
  WebMock.enable!
9
10
 
11
+ WebMock.disable_net_connect!({
12
+ allow_localhost: true,
13
+ allow: 'chromedriver.storage.googleapis.com'
14
+ })
15
+
10
16
  RSpec.configure do |config|
11
17
  # Enable flags like --only-failures and --next-failure
12
18
  config.example_status_persistence_file_path = ".rspec_status"
@@ -27,9 +27,11 @@ Gem::Specification.new do |spec|
27
27
  spec.add_runtime_dependency "natto", ">= 1.1.2"
28
28
  spec.add_runtime_dependency "sanitize", ">= 5.0.0"
29
29
  spec.add_runtime_dependency "cld", ">= 0.8.0"
30
+ spec.add_runtime_dependency "selenium-webdriver", "= 3.142.7"
30
31
 
31
32
  spec.add_development_dependency "rake", ">= 10.0"
32
33
  spec.add_development_dependency "rspec", ">= 3.0"
33
- spec.add_development_dependency "pry", ">= 0.12.2"
34
+ spec.add_development_dependency "pry", ">= 0.13.1"
34
35
  spec.add_development_dependency "webmock", ">= 3.6.0"
36
+ spec.add_development_dependency "pry-byebug", "3.9.0"
35
37
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: web_stat
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.10
4
+ version: 0.3.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - yusuke abe
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-05-04 00:00:00.000000000 Z
11
+ date: 2020-06-13 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -108,6 +108,20 @@ dependencies:
108
108
  - - ">="
109
109
  - !ruby/object:Gem::Version
110
110
  version: 0.8.0
111
+ - !ruby/object:Gem::Dependency
112
+ name: selenium-webdriver
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - '='
116
+ - !ruby/object:Gem::Version
117
+ version: 3.142.7
118
+ type: :runtime
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - '='
123
+ - !ruby/object:Gem::Version
124
+ version: 3.142.7
111
125
  - !ruby/object:Gem::Dependency
112
126
  name: rake
113
127
  requirement: !ruby/object:Gem::Requirement
@@ -142,14 +156,14 @@ dependencies:
142
156
  requirements:
143
157
  - - ">="
144
158
  - !ruby/object:Gem::Version
145
- version: 0.12.2
159
+ version: 0.13.1
146
160
  type: :development
147
161
  prerelease: false
148
162
  version_requirements: !ruby/object:Gem::Requirement
149
163
  requirements:
150
164
  - - ">="
151
165
  - !ruby/object:Gem::Version
152
- version: 0.12.2
166
+ version: 0.13.1
153
167
  - !ruby/object:Gem::Dependency
154
168
  name: webmock
155
169
  requirement: !ruby/object:Gem::Requirement
@@ -164,6 +178,20 @@ dependencies:
164
178
  - - ">="
165
179
  - !ruby/object:Gem::Version
166
180
  version: 3.6.0
181
+ - !ruby/object:Gem::Dependency
182
+ name: pry-byebug
183
+ requirement: !ruby/object:Gem::Requirement
184
+ requirements:
185
+ - - '='
186
+ - !ruby/object:Gem::Version
187
+ version: 3.9.0
188
+ type: :development
189
+ prerelease: false
190
+ version_requirements: !ruby/object:Gem::Requirement
191
+ requirements:
192
+ - - '='
193
+ - !ruby/object:Gem::Version
194
+ version: 3.9.0
167
195
  description: Fetch the web pages and stat.
168
196
  email:
169
197
  - yube@newsdict.jp
@@ -177,12 +205,17 @@ files:
177
205
  - ".ruby-version"
178
206
  - ".travis.yml"
179
207
  - CODE_OF_CONDUCT.md
208
+ - Dockerfile
180
209
  - Gemfile
181
210
  - Gemfile.lock
182
211
  - LICENSE.txt
183
212
  - README.md
184
213
  - Rakefile
185
214
  - bin/fetch_as_html
215
+ - docker-compose.yml
216
+ - docker/exec
217
+ - docker/start
218
+ - lib/helpers/web_drive_helper.rb
186
219
  - lib/web_stat.rb
187
220
  - lib/web_stat/categorize.rb
188
221
  - lib/web_stat/config/web_stat.yml
@@ -191,6 +224,7 @@ files:
191
224
  - lib/web_stat/fetch.rb
192
225
  - lib/web_stat/fetch/fetch_as_html.rb
193
226
  - lib/web_stat/fetch/fetch_as_web.rb
227
+ - lib/web_stat/final_redirect_url.rb
194
228
  - lib/web_stat/tag.rb
195
229
  - lib/web_stat/tasks/install.rake
196
230
  - lib/web_stat/version.rb
@@ -224,7 +258,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
224
258
  - !ruby/object:Gem::Version
225
259
  version: '0'
226
260
  requirements: []
227
- rubygems_version: 3.0.3
261
+ rubygems_version: 3.1.2
228
262
  signing_key:
229
263
  specification_version: 4
230
264
  summary: Get the status of the web pages.