RubyGems - web_stat - Versions diffs - 0.3.11 → 0.3.16 - Mend

web_stat 0.3.11 → 0.3.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15)

checksums.yaml +4 -4
data/Gemfile.lock +15 -3
data/README.md +4 -6
data/lib/web_stat.rb +1 -0
data/lib/web_stat/configure.rb +1 -1
data/lib/web_stat/fetch.rb +24 -8
data/lib/web_stat/fetch/fetch_as_web.rb +25 -1
data/lib/web_stat/final_redirect_url.rb +1 -1
data/lib/web_stat/version.rb +1 -1
data/spec/fixtures/pdfs/newsdict.blog.pdf +0 -0
data/spec/fixtures/pdfs/rfc2616.pdf +0 -0
data/spec/spec_helper.rb +16 -6
data/spec/web_stat/fetch_spec.rb +101 -1
data/web_stat.gemspec +2 -1
metadata +22 -4

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 7a94017f641fb1f84d67ea5d650c094df5a1d738ccbbf880c112f95da4a2792b
-  data.tar.gz: 7383ba299f9f02bae998a104e84302ea99a11a28c77ad42fa23637eec9e2a922
+  metadata.gz: c332bb9cf67262c2b5d8e3c30c861107bd6ac8e3d86136730864e49b10dabcce
+  data.tar.gz: 712f39109989e917e8af6b84fe40b64ce84009a9de4b3647c03177ecb69bc58d
 SHA512:
-  metadata.gz: 4c7d593118d75755f3db68a9f7995de60c63c5c47ba92c30a4772bc6ade5cefda14c8b44a87475ebc6cb3ab1a0badefcd3df1f6d2038aec714af654a07c89ccd
-  data.tar.gz: 2aa7dc288acd6de6207ee8d3e2833ba2b480b99a6d3ffeed651cce6982bbb557adfe173dfd3af1d10345bb0ad4710f1e4b434272039bb8a988beabb35811c5b4
+  metadata.gz: 3607fbe1e76018e3e523ed26a2777837c539a80f5c68da195996616d3566f6034125f0d49e96db9338ae40aeda5dac000968f6e089a7d96c19a412f14d2acbd2
+  data.tar.gz: 027d3b911cbee0dfdd7a8f85b32d24e4b8c9e2f916a47a2d3ea31b34fe0e2f07b452e00372d2028d511d39fba7df8f26347789fd795103422711d0e979b1d8a2

data/Gemfile.lock CHANGED

@@ -1,12 +1,13 @@
 PATH
   remote: .
   specs:
-    web_stat (0.3.10)
+    web_stat (0.3.15)
       bundler (>= 2.0.2)
       cld (>= 0.8.0)
       mechanize (>= 2.7)
       natto (>= 1.1.2)
       nokogiri (>= 1.10.4)
+      pdf-reader (= 2.4.0)
       ruby-readability (>= 0.7)
       sanitize (>= 5.0.0)
       selenium-webdriver (= 3.142.7)
@@ -14,8 +15,10 @@ PATH
 GEM
   remote: https://rubygems.org/
   specs:
+    Ascii85 (1.0.3)
     addressable (2.7.0)
       public_suffix (>= 2.0.2, < 5.0)
+    afm (0.2.2)
     byebug (11.1.3)
     childprocess (3.0.0)
     cld (0.8.0)
@@ -31,6 +34,7 @@ GEM
     ffi (1.13.1)
     guess_html_encoding (0.0.11)
     hashdiff (1.0.1)
+    hashery (2.1.2)
     http-cookie (1.0.3)
       domain_name (~> 0.5)
     mechanize (2.7.6)
@@ -57,6 +61,12 @@ GEM
     nokogumbo (2.0.2)
       nokogiri (~> 1.8, >= 1.8.4)
     ntlm-http (0.1.1)
+    pdf-reader (2.4.0)
+      Ascii85 (~> 1.0.0)
+      afm (~> 0.2.1)
+      hashery (~> 2.0)
+      ruby-rc4
+      ttfunk
     pry (0.13.1)
       coderay (~> 1.1)
       method_source (~> 1.0)
@@ -78,18 +88,20 @@ GEM
       diff-lcs (>= 1.2.0, < 2.0)
       rspec-support (~> 3.9.0)
     rspec-support (3.9.3)
+    ruby-rc4 (0.1.5)
     ruby-readability (0.7.0)
       guess_html_encoding (>= 0.0.4)
       nokogiri (>= 1.6.0)
     rubyzip (2.3.0)
     safe_yaml (1.0.5)
-    sanitize (5.2.0)
+    sanitize (5.2.1)
       crass (~> 1.0.2)
       nokogiri (>= 1.8.0)
       nokogumbo (~> 2.0)
     selenium-webdriver (3.142.7)
       childprocess (>= 0.5, < 4.0)
       rubyzip (>= 1.2.2)
+    ttfunk (1.6.2.1)
     unf (0.1.4)
       unf_ext
     unf_ext (0.0.7.7)
@@ -108,7 +120,7 @@ DEPENDENCIES
   rake (>= 10.0)
   rspec (>= 3.0)
   web_stat!
-  webmock (>= 3.6.0)
+  webmock (>= 3.8.3)
 BUNDLED WITH
    2.1.4

data/README.md CHANGED

@@ -55,12 +55,10 @@ And then execute:
 ### spec
-  $ bundle exec rake spec
-or
-  $ bundle exec rspec
+  $ docker/start -d
+  $ docker/exec ENV=development bundle exec rspec
 Test a file
-  $ bundle exec rspec spec/web_stat/fetch_spec.rb
+  $ docker/start -d
+  $ docker/exec ENV=development bundle exec rspec spec/web_stat/fetch_spec.rb

data/lib/web_stat.rb CHANGED

@@ -8,6 +8,7 @@ require 'sanitize'
 require 'nokogiri'
 require 'open-uri'
 require 'net/http'
+require 'pdf/reader'
 require 'ruby-readability'
 require 'selenium-webdriver'

data/lib/web_stat/configure.rb CHANGED

@@ -9,7 +9,7 @@ module WebStat
         if defined? Rails
           YAML.load_file(get_configure_path)[Rails.env]
         else
-          YAML.load_file(get_configure_path)["production"]
+          YAML.load_file(get_configure_path)[ENV["ENV"] || "production"]
         end
       end

data/lib/web_stat/fetch.rb CHANGED

@@ -1,7 +1,12 @@
 module WebStat
   class Fetch
+    THUMBNAIL_REGEXS = {
+      :youtube => [
+        %r{^https://www.youtube.com/watch\?v=([^&]+)},
+        'http://img.youtube.com/vi/\1/default.jpg'
+        ]
+    }
     attr_accessor :url, :html, :nokogiri, :userdic, :status
     # Get title
     # @return [String] title
     def title
@@ -19,7 +24,8 @@ module WebStat
         title.strip
       end
     end
-    # Get name of domain
+    # Get name of domain
     def site_name
       begin
         site_name = @nokogiri.title.split(/#{WebStat::Configure.get["regex_to_sprit_title"]}/, 2).last
@@ -36,7 +42,7 @@ module WebStat
     def content
       Sanitize.clean(Readability::Document.new(@nokogiri.at('body')).content)
     end
     # Get temporary path of image
     def eyecatch_image_path
       # Reuse `path` in this method
@@ -47,8 +53,18 @@ module WebStat
           break
         end
       end
-      if path.nil? || path.empty? || @nokogiri.at('body').xpath('//img').first
-        path = @nokogiri.at('body').xpath('//img').first.attr('src')
+      # If there is a thumbnail rule, apply it.
+      THUMBNAIL_REGEXS.each do |provider, v|
+        if @url.match(v[0])
+          return @url.gsub(v[0], v[1])
+        end
+      end
+      readability_content = ::Nokogiri::HTML(Readability::Document.new(@nokogiri.at('body')).content)
+      if (path.nil? || path.empty?) && readability_content.xpath('//img').first
+        path =  readability_content.xpath('//img').first.attr('src')
+      end
+      if (path.nil? || path.empty?) && @nokogiri.xpath('//img').first
+        path = @nokogiri.xpath('//img').first.attr('src')
       end
       if ! path.nil? && path.match(/^\//)
         "#{URI.parse(@url).scheme}://#{URI.parse(@url).host}#{path}"
@@ -56,7 +72,7 @@ module WebStat
         path
       end
     end
     # Get local path to save url
     # @param [String] url
     def save_local_path(url)
@@ -73,7 +89,7 @@ module WebStat
       end
       tmp_file
     end
     # Get url
     # @param [String] url
     # @param [String] body
@@ -103,7 +119,7 @@ module WebStat
       end
       body
     end
     # Get the informations of @url
     # @param [Hash] Specify a dictionary for each language code. example ) {"ja": /***/**.dic, "other": /***/***.dic}
     def stat(userdics: nil)

data/lib/web_stat/fetch/fetch_as_web.rb CHANGED

@@ -9,7 +9,31 @@ module WebStat
         raise WebStat::INVALID_URL, url
       end
       @url = original_url(url)
-      @html = get_url(@url)
+      if @url.match?(/\.pdf$/)
+        title = nil
+        body = nil
+        URI.open(@url) do |io|
+          reader = PDF::Reader.new(io)
+          if reader.info.key?(:Title)
+            title = reader.info[:Title]
+          else
+            title = File.basename(@url, ".pdf")
+          end
+          body = reader.pages.first.text
+        end
+        @html = <<-"EOS"
+          <html>
+          <head>
+            <title>#{title}</title>
+          </head>
+          <body>
+            #{body}
+          </body>
+          </html>
+        EOS
+      else
+        @html = get_url(@url)
+      end
       @nokogiri = ::Nokogiri::HTML(@html)
     end

data/lib/web_stat/final_redirect_url.rb CHANGED

@@ -11,7 +11,7 @@ module WebStat
             redirect_lookup_depth = options[:depth].to_i > 0 ? options[:depth].to_i : 10
             response_uri = get_final_redirect_url(url, redirect_lookup_depth)
             final_url =  url_string_from_uri(response_uri)
-          rescue Exception => ex
+          rescue => e
             # nothing
           end
         end

data/lib/web_stat/version.rb CHANGED

@@ -1,3 +1,3 @@
 module WebStat
-  VERSION = "0.3.11"
+  VERSION = "0.3.16"
 end

data/spec/fixtures/pdfs/newsdict.blog.pdf ADDED

Binary file

data/spec/fixtures/pdfs/rfc2616.pdf ADDED

Binary file

data/spec/spec_helper.rb CHANGED

@@ -6,13 +6,9 @@ require "web_stat"
 require 'webmock'
 include WebMock::API
+ENV['ENV'] = 'test'
 WebMock.enable!
-WebMock.disable_net_connect!({
-  allow_localhost: true,
-  allow: 'chromedriver.storage.googleapis.com'
-})
 RSpec.configure do |config|
   # Enable flags like --only-failures and --next-failure
   config.example_status_persistence_file_path = ".rspec_status"
@@ -57,7 +53,14 @@ module WebStatTestHelper
     # Get htmls of fixture
     def scheme_and_files
       Dir.glob(File.join(File.dirname(__FILE__), "fixtures", "htmls", "*.html")).map do |file|
-	"https://newsdict.blog/#{File.basename(file)}"
+	      "https://newsdict.blog/#{File.basename(file)}"
+      end
+    end
+    # Get pdfs of fixture
+    def pdfs
+      Dir.glob(File.join(File.dirname(__FILE__), "fixtures", "pdfs", "*.pdf")).map do |file|
+	      "https://newsdict.blog/#{File.basename(file)}"
       end
     end
   end
@@ -72,6 +75,13 @@ WebStatTestHelper.scheme_and_files.each do |url|
       body: File.new(File.join(File.dirname(__FILE__), "fixtures", "htmls", File.basename(url))),
       headers: {content_type: 'application/html; charset=utf-8'})
 end
+WebStatTestHelper.pdfs.each do |url|
+  WebMock.stub_request(:get, url)
+    .to_return(
+      status: 200,
+      body: File.read(File.join(File.dirname(__FILE__), "fixtures", "pdfs", File.basename(url))),
+      headers: {content_type: 'application/pdf'})
+end
 WebMock.stub_request(:get, "https://newsdict.blog/robots.txt")
     .to_return(

data/spec/web_stat/fetch_spec.rb CHANGED

@@ -66,6 +66,106 @@ RSpec.describe WebStat::Fetch do
     end
   end
+  [{fixture: "https://newsdict.blog/rfc2616.pdf", class: WebStat::FetchAsWeb}].each do |fetch|
+    it "Get title by #{fetch[:class].to_s}" do
+      web_stat = fetch[:class].new(fetch[:fixture])
+      expect(web_stat.title).to eq "Microsoft Word"
+    end
+    it "Get site name by #{fetch[:class].to_s}" do
+      web_stat = fetch[:class].new(fetch[:fixture])
+      expect(web_stat.site_name).to eq "RFC2616.doc"
+    end
+    it "Get Document's content by #{fetch[:class].to_s}" do
+      web_stat = fetch[:class].new(fetch[:fixture])
+      expect(web_stat.content).not_to eq nil
+    end
+    it "WebStat content do not include html by #{fetch[:class].to_s}" do
+      web_stat = fetch[:class].new(fetch[:fixture])
+      expect(Sanitize.clean(web_stat.content).length).to eq web_stat.content.length
+    end
+    it "Get eyecatch image blob  by #{fetch[:class].to_s}" do
+      web_stat = fetch[:class].new(fetch[:fixture])
+      web_stat.url = "https://newsdict.blog"
+      unless web_stat.stat[:eyecatch_image_path].nil?
+        image = File.read(web_stat.stat[:eyecatch_image_path])
+        expect(image.encoding.to_s).to eq("UTF-8")
+      end
+    end
+    it "Get eyecatch image path by #{fetch[:class].to_s}" do
+      web_stat = fetch[:class].new(fetch[:fixture])
+      web_stat.url = "https://newsdict.blog"
+      expect(web_stat.eyecatch_image_path).to be_string_or_nil
+    end
+    it "Get language_iso by #{fetch[:class].to_s}" do
+      web_stat = fetch[:class].new(fetch[:fixture])
+      web_stat.url = "https://newsdict.blog"
+      expect(web_stat.stat[:language_code]).to eq("en")
+    end
+    it "Get local path of eyecatch image by #{fetch[:class].to_s}" do
+      web_stat = fetch[:class].new(fetch[:fixture])
+      web_stat.url = "https://newsdict.blog"
+      expect(web_stat.stat[:eyecatch_image_path]).to be_tmp_file_or_nil
+    end
+  end
+  [{fixture: "https://newsdict.blog/newsdict.blog.pdf", class: WebStat::FetchAsWeb}].each do |fetch|
+    it "Get title by #{fetch[:class].to_s}" do
+      web_stat = fetch[:class].new(fetch[:fixture])
+      expect(web_stat.title).to eq "newsdict.blog"
+    end
+    it "Get site name by #{fetch[:class].to_s}" do
+      web_stat = fetch[:class].new(fetch[:fixture])
+      expect(web_stat.site_name).to eq "newsdict.blog"
+    end
+    it "Get Document's content by #{fetch[:class].to_s}" do
+      web_stat = fetch[:class].new(fetch[:fixture])
+      expect(web_stat.content).not_to eq nil
+    end
+    it "WebStat content do not include html by #{fetch[:class].to_s}" do
+      web_stat = fetch[:class].new(fetch[:fixture])
+      expect(Sanitize.clean(web_stat.content).length).to eq web_stat.content.length
+    end
+    it "Get eyecatch image blob  by #{fetch[:class].to_s}" do
+      web_stat = fetch[:class].new(fetch[:fixture])
+      web_stat.url = "https://newsdict.blog"
+      unless web_stat.stat[:eyecatch_image_path].nil?
+        image = File.read(web_stat.stat[:eyecatch_image_path])
+        expect(image.encoding.to_s).to eq("UTF-8")
+      end
+    end
+    it "Get eyecatch image path by #{fetch[:class].to_s}" do
+      web_stat = fetch[:class].new(fetch[:fixture])
+      web_stat.url = "https://newsdict.blog"
+      expect(web_stat.eyecatch_image_path).to be_string_or_nil
+    end
+    it "Get language_iso by #{fetch[:class].to_s}" do
+      web_stat = fetch[:class].new(fetch[:fixture])
+      web_stat.url = "https://newsdict.blog"
+      expect(web_stat.stat[:language_code]).to eq("ja")
+    end
+    it "Get local path of eyecatch image by #{fetch[:class].to_s}" do
+      web_stat = fetch[:class].new(fetch[:fixture])
+      web_stat.url = "https://newsdict.blog"
+      expect(web_stat.stat[:eyecatch_image_path]).to be_tmp_file_or_nil
+    end
+  end
   it "WebStat.stat_by_html" do
     WebStatTestHelper.htmls.each do |fixture|
       web_stat = WebStat.stat_by_html(fixture, "https://newsdict.blog")
@@ -104,4 +204,4 @@ RSpec.describe WebStat::Fetch do
     expect(web_stat_fetch_web_class.url_valid?("https://gxyt4.app.goo.gl/Mn64U")).to be true
     expect(web_stat_fetch_web_class.url_valid?("https://status.cloud.google.com/incident/cloud-functions/19010")).to be true
   end
-end
+end

data/web_stat.gemspec CHANGED

@@ -28,10 +28,11 @@ Gem::Specification.new do |spec|
   spec.add_runtime_dependency "sanitize", ">= 5.0.0"
   spec.add_runtime_dependency "cld", ">= 0.8.0"
   spec.add_runtime_dependency "selenium-webdriver", "= 3.142.7"
+  spec.add_runtime_dependency "pdf-reader", "2.4.0"
   spec.add_development_dependency "rake", ">= 10.0"
   spec.add_development_dependency "rspec", ">= 3.0"
   spec.add_development_dependency "pry", ">= 0.13.1"
-  spec.add_development_dependency "webmock", ">= 3.6.0"
+  spec.add_development_dependency "webmock", ">= 3.8.3"
   spec.add_development_dependency "pry-byebug", "3.9.0"
 end

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: web_stat
 version: !ruby/object:Gem::Version
-  version: 0.3.11
+  version: 0.3.16
 platform: ruby
 authors:
 - yusuke abe
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2020-06-21 00:00:00.000000000 Z
+date: 2020-09-07 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -122,6 +122,20 @@ dependencies:
     - - '='
       - !ruby/object:Gem::Version
         version: 3.142.7
+- !ruby/object:Gem::Dependency
+  name: pdf-reader
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '='
+      - !ruby/object:Gem::Version
+        version: 2.4.0
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '='
+      - !ruby/object:Gem::Version
+        version: 2.4.0
 - !ruby/object:Gem::Dependency
   name: rake
   requirement: !ruby/object:Gem::Requirement
@@ -170,14 +184,14 @@ dependencies:
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        version: 3.6.0
+        version: 3.8.3
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        version: 3.6.0
+        version: 3.8.3
 - !ruby/object:Gem::Dependency
   name: pry-byebug
   requirement: !ruby/object:Gem::Requirement
@@ -233,6 +247,8 @@ files:
 - spec/fixtures/htmls/image.html
 - spec/fixtures/images/facebook-3.jpg
 - spec/fixtures/images/newsdict-5d8601394c3f4eea2d7161ab92ab327ac7099e22214c853327011b3a71859b8e.png
+- spec/fixtures/pdfs/newsdict.blog.pdf
+- spec/fixtures/pdfs/rfc2616.pdf
 - spec/spec_helper.rb
 - spec/web_stat/configure_spec.rb
 - spec/web_stat/fetch_spec.rb
@@ -268,6 +284,8 @@ test_files:
 - spec/fixtures/htmls/image.html
 - spec/fixtures/images/facebook-3.jpg
 - spec/fixtures/images/newsdict-5d8601394c3f4eea2d7161ab92ab327ac7099e22214c853327011b3a71859b8e.png
+- spec/fixtures/pdfs/newsdict.blog.pdf
+- spec/fixtures/pdfs/rfc2616.pdf
 - spec/spec_helper.rb
 - spec/web_stat/configure_spec.rb
 - spec/web_stat/fetch_spec.rb