RubyGems - web_stat - Versions diffs - 0.3.14 → 0.3.19 - Mend

web_stat 0.3.14 → 0.3.19

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14)

checksums.yaml +4 -4
data/Gemfile.lock +16 -4
data/lib/web_stat.rb +1 -0
data/lib/web_stat/config/web_stat.yml +5 -1
data/lib/web_stat/fetch.rb +14 -8
data/lib/web_stat/fetch/fetch_as_web.rb +25 -1
data/lib/web_stat/version.rb +1 -1
data/spec/fixtures/pdfs/newsdict.blog.pdf +0 -0
data/spec/fixtures/pdfs/rfc2616.pdf +0 -0
data/spec/spec_helper.rb +17 -2
data/spec/web_stat/configure_spec.rb +17 -2
data/spec/web_stat/fetch_spec.rb +101 -1
data/web_stat.gemspec +2 -1
metadata +22 -4

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 549db2077648ce028b556a72126335f05336155f86c4b18d47856f219c71fff0
-  data.tar.gz: be8e0cee272fc20013659346608bf9a50a69dc48e777a46214c29f5c1865232d
+  metadata.gz: 2893819d947835e7cc92b35361c487ede791e0bd87ac7aa94bb0d0c3e28780a0
+  data.tar.gz: 35d90e33f07dc24fabeabca7aef669519053781ff71ddeec12ca85ee52521c3b
 SHA512:
-  metadata.gz: c78fa085f475c7cdf0747b4c777e357503ce41929bbe5462ccda7b28a6cf4f20a5394deb7853ee11570a4b3338573e392f8697e61484649fe06beadc54aa38a8
-  data.tar.gz: 941f0de20548a37899ac7610bd95b381ac18ca71761ccfb7f7788c299dd41ff521ae7eea283df64ee624044f919d2951f396ec38c3c4a15ded378ee0acbd0a20
+  metadata.gz: 6f576937d619990b2ccb72bf016975521065e052605c7c4835f233f0f58da866836cb961ea8bd8d213bebee84671717d73b35cc82a0a49dcef9ad04f276cacd3
+  data.tar.gz: 87711062737c14f00523ea51a91de50f5e9a00364c4df73a61bd49fcfa5559ae7b995718168b2fa48346d0f3e25af16aa98df16e56b4c6f1dea7ebf74a629664

data/Gemfile.lock CHANGED

@@ -1,12 +1,13 @@
 PATH
   remote: .
   specs:
-    web_stat (0.3.12)
+    web_stat (0.3.19)
       bundler (>= 2.0.2)
       cld (>= 0.8.0)
       mechanize (>= 2.7)
       natto (>= 1.1.2)
       nokogiri (>= 1.10.4)
+      pdf-reader (= 2.4.0)
       ruby-readability (>= 0.7)
       sanitize (>= 5.0.0)
       selenium-webdriver (= 3.142.7)
@@ -14,8 +15,10 @@ PATH
 GEM
   remote: https://rubygems.org/
   specs:
+    Ascii85 (1.0.3)
     addressable (2.7.0)
       public_suffix (>= 2.0.2, < 5.0)
+    afm (0.2.2)
     byebug (11.1.3)
     childprocess (3.0.0)
     cld (0.8.0)
@@ -31,6 +34,7 @@ GEM
     ffi (1.13.1)
     guess_html_encoding (0.0.11)
     hashdiff (1.0.1)
+    hashery (2.1.2)
     http-cookie (1.0.3)
       domain_name (~> 0.5)
     mechanize (2.7.6)
@@ -52,11 +56,17 @@ GEM
     net-http-digest_auth (1.4.1)
     net-http-persistent (4.0.0)
       connection_pool (~> 2.2)
-    nokogiri (1.10.9)
+    nokogiri (1.10.10)
       mini_portile2 (~> 2.4.0)
     nokogumbo (2.0.2)
       nokogiri (~> 1.8, >= 1.8.4)
     ntlm-http (0.1.1)
+    pdf-reader (2.4.0)
+      Ascii85 (~> 1.0.0)
+      afm (~> 0.2.1)
+      hashery (~> 2.0)
+      ruby-rc4
+      ttfunk
     pry (0.13.1)
       coderay (~> 1.1)
       method_source (~> 1.0)
@@ -78,18 +88,20 @@ GEM
       diff-lcs (>= 1.2.0, < 2.0)
       rspec-support (~> 3.9.0)
     rspec-support (3.9.3)
+    ruby-rc4 (0.1.5)
     ruby-readability (0.7.0)
       guess_html_encoding (>= 0.0.4)
       nokogiri (>= 1.6.0)
     rubyzip (2.3.0)
     safe_yaml (1.0.5)
-    sanitize (5.2.0)
+    sanitize (5.2.1)
       crass (~> 1.0.2)
       nokogiri (>= 1.8.0)
       nokogumbo (~> 2.0)
     selenium-webdriver (3.142.7)
       childprocess (>= 0.5, < 4.0)
       rubyzip (>= 1.2.2)
+    ttfunk (1.6.2.1)
     unf (0.1.4)
       unf_ext
     unf_ext (0.0.7.7)
@@ -108,7 +120,7 @@ DEPENDENCIES
   rake (>= 10.0)
   rspec (>= 3.0)
   web_stat!
-  webmock (>= 3.6.0)
+  webmock (>= 3.8.3)
 BUNDLED WITH
    2.1.4

data/lib/web_stat.rb CHANGED

@@ -8,6 +8,7 @@ require 'sanitize'
 require 'nokogiri'
 require 'open-uri'
 require 'net/http'
+require 'pdf/reader'
 require 'ruby-readability'
 require 'selenium-webdriver'

data/lib/web_stat/config/web_stat.yml CHANGED

@@ -1,7 +1,7 @@
 development: &development
   # Minimum number of characters to detect meta title
   min_length_of_meta_title: 10
-  # Split regular expression for titles
+  # Split regular expression for titles
   regex_to_sprit_title: '\||-|:|｜|：|〜|\~| – '
   # User Agent
   user_agent: "web_stat gem agent"
@@ -14,6 +14,10 @@ development: &development
     - '//img/@src'
   userdic: ""
   use_chromedirver: false
+  thumbnail_regex:
+    youtube:
+      - '%r{^https://www.youtube.com/watch\?v=([^&]+)}'
+      - 'http://img.youtube.com/vi/\1/default.jpg'
 test:
   <<: *development
 production:

data/lib/web_stat/fetch.rb CHANGED

@@ -1,7 +1,6 @@
 module WebStat
   class Fetch
     attr_accessor :url, :html, :nokogiri, :userdic, :status
     # Get title
     # @return [String] title
     def title
@@ -19,7 +18,8 @@ module WebStat
         title.strip
       end
     end
-    # Get name of domain
+    # Get name of domain
     def site_name
       begin
         site_name = @nokogiri.title.split(/#{WebStat::Configure.get["regex_to_sprit_title"]}/, 2).last
@@ -36,7 +36,7 @@ module WebStat
     def content
       Sanitize.clean(Readability::Document.new(@nokogiri.at('body')).content)
     end
     # Get temporary path of image
     def eyecatch_image_path
       # Reuse `path` in this method
@@ -47,9 +47,15 @@ module WebStat
           break
         end
       end
+      # If there is a thumbnail rule, apply it.
+      WebStat::Configure.get["thumbnail_regex"].each do |provider, v|
+        if @url.match(v[0])
+          return @url.gsub(v[0], v[1])
+        end
+      end
       readability_content = ::Nokogiri::HTML(Readability::Document.new(@nokogiri.at('body')).content)
       if (path.nil? || path.empty?) && readability_content.xpath('//img').first
-        path = readability_content.xpath('//img').first.attr('src')
+        path =  readability_content.xpath('//img').first.attr('src')
       end
       if (path.nil? || path.empty?) && @nokogiri.xpath('//img').first
         path = @nokogiri.xpath('//img').first.attr('src')
@@ -60,7 +66,7 @@ module WebStat
         path
       end
     end
     # Get local path to save url
     # @param [String] url
     def save_local_path(url)
@@ -71,13 +77,13 @@ module WebStat
       File.open(tmp_file, "w+b") do |_file|
         if image.class == Mechanize::File
           _file.puts(image.body)
-        else
+        elsif image.respond_to?(:body_io)
           _file.puts(image.body_io.read)
         end
       end
       tmp_file
     end
     # Get url
     # @param [String] url
     # @param [String] body
@@ -107,7 +113,7 @@ module WebStat
       end
       body
     end
     # Get the informations of @url
     # @param [Hash] Specify a dictionary for each language code. example ) {"ja": /***/**.dic, "other": /***/***.dic}
     def stat(userdics: nil)

data/lib/web_stat/fetch/fetch_as_web.rb CHANGED

@@ -9,7 +9,31 @@ module WebStat
         raise WebStat::INVALID_URL, url
       end
       @url = original_url(url)
-      @html = get_url(@url)
+      if @url.match?(/\.pdf$/)
+        title = nil
+        body = nil
+        URI.open(@url) do |io|
+          reader = PDF::Reader.new(io)
+          if reader.info.key?(:Title)
+            title = reader.info[:Title]
+          else
+            title = File.basename(@url, ".pdf")
+          end
+          body = reader.pages.first.text
+        end
+        @html = <<-"EOS"
+          <html>
+          <head>
+            <title>#{title}</title>
+          </head>
+          <body>
+            #{body}
+          </body>
+          </html>
+        EOS
+      else
+        @html = get_url(@url)
+      end
       @nokogiri = ::Nokogiri::HTML(@html)
     end

data/lib/web_stat/version.rb CHANGED

@@ -1,3 +1,3 @@
 module WebStat
-  VERSION = "0.3.14"
+  VERSION = "0.3.19"
 end

data/spec/fixtures/pdfs/newsdict.blog.pdf ADDED

Binary file

data/spec/fixtures/pdfs/rfc2616.pdf ADDED

Binary file

data/spec/spec_helper.rb CHANGED

@@ -6,8 +6,9 @@ require "web_stat"
 require 'webmock'
 include WebMock::API
+ENV['ENV'] = 'test'
 WebMock.enable!
 RSpec.configure do |config|
   # Enable flags like --only-failures and --next-failure
   config.example_status_persistence_file_path = ".rspec_status"
@@ -52,7 +53,14 @@ module WebStatTestHelper
     # Get htmls of fixture
     def scheme_and_files
       Dir.glob(File.join(File.dirname(__FILE__), "fixtures", "htmls", "*.html")).map do |file|
-	"https://newsdict.blog/#{File.basename(file)}"
+	      "https://newsdict.blog/#{File.basename(file)}"
+      end
+    end
+    # Get pdfs of fixture
+    def pdfs
+      Dir.glob(File.join(File.dirname(__FILE__), "fixtures", "pdfs", "*.pdf")).map do |file|
+	      "https://newsdict.blog/#{File.basename(file)}"
       end
     end
   end
@@ -67,6 +75,13 @@ WebStatTestHelper.scheme_and_files.each do |url|
       body: File.new(File.join(File.dirname(__FILE__), "fixtures", "htmls", File.basename(url))),
       headers: {content_type: 'application/html; charset=utf-8'})
 end
+WebStatTestHelper.pdfs.each do |url|
+  WebMock.stub_request(:get, url)
+    .to_return(
+      status: 200,
+      body: File.read(File.join(File.dirname(__FILE__), "fixtures", "pdfs", File.basename(url))),
+      headers: {content_type: 'application/pdf'})
+end
 WebMock.stub_request(:get, "https://newsdict.blog/robots.txt")
     .to_return(

data/spec/web_stat/configure_spec.rb CHANGED

@@ -3,11 +3,26 @@ RSpec.describe WebStat::Configure do
     configure = WebStat::Configure.get
     expect(configure).not_to eq nil
   end
   it "Readable Config" do
     config = WebStat::Configure.get
     expect(config["min_length_of_meta_title"]).to eq 10
     expect(config["regex_to_sprit_title"]).to eq '\||-|:|｜|：|〜|\~| – '
   end
+  it "Get thumbnail_regex.youtube." do
+    config = WebStat::Configure.get
+    expect(config["thumbnail_regex"]["yotube"].nil?).to eq true
+    expect(config["thumbnail_regex"]["youtube"].count).to eq 2
+  end
+  it "Match youtube url." do
+    sample_url = "https://www.youtube.com/watch?v=aChpsuUffUM"
+    WebStat::Configure.get["thumbnail_regex"].each do |provider, v|
+      if sample_url.match(v[0])
+        expect(sample_url.gsub(v[0], v[1])).to eq 'http://img.youtube.com/vi/aChpsuUffUM/default.jpg'
+      end
+    end
+  end
 end

data/spec/web_stat/fetch_spec.rb CHANGED

@@ -66,6 +66,106 @@ RSpec.describe WebStat::Fetch do
     end
   end
+  [{fixture: "https://newsdict.blog/rfc2616.pdf", class: WebStat::FetchAsWeb}].each do |fetch|
+    it "Get title by #{fetch[:class].to_s}" do
+      web_stat = fetch[:class].new(fetch[:fixture])
+      expect(web_stat.title).to eq "Microsoft Word"
+    end
+    it "Get site name by #{fetch[:class].to_s}" do
+      web_stat = fetch[:class].new(fetch[:fixture])
+      expect(web_stat.site_name).to eq "RFC2616.doc"
+    end
+    it "Get Document's content by #{fetch[:class].to_s}" do
+      web_stat = fetch[:class].new(fetch[:fixture])
+      expect(web_stat.content).not_to eq nil
+    end
+    it "WebStat content do not include html by #{fetch[:class].to_s}" do
+      web_stat = fetch[:class].new(fetch[:fixture])
+      expect(Sanitize.clean(web_stat.content).length).to eq web_stat.content.length
+    end
+    it "Get eyecatch image blob  by #{fetch[:class].to_s}" do
+      web_stat = fetch[:class].new(fetch[:fixture])
+      web_stat.url = "https://newsdict.blog"
+      unless web_stat.stat[:eyecatch_image_path].nil?
+        image = File.read(web_stat.stat[:eyecatch_image_path])
+        expect(image.encoding.to_s).to eq("UTF-8")
+      end
+    end
+    it "Get eyecatch image path by #{fetch[:class].to_s}" do
+      web_stat = fetch[:class].new(fetch[:fixture])
+      web_stat.url = "https://newsdict.blog"
+      expect(web_stat.eyecatch_image_path).to be_string_or_nil
+    end
+    it "Get language_iso by #{fetch[:class].to_s}" do
+      web_stat = fetch[:class].new(fetch[:fixture])
+      web_stat.url = "https://newsdict.blog"
+      expect(web_stat.stat[:language_code]).to eq("en")
+    end
+    it "Get local path of eyecatch image by #{fetch[:class].to_s}" do
+      web_stat = fetch[:class].new(fetch[:fixture])
+      web_stat.url = "https://newsdict.blog"
+      expect(web_stat.stat[:eyecatch_image_path]).to be_tmp_file_or_nil
+    end
+  end
+  [{fixture: "https://newsdict.blog/newsdict.blog.pdf", class: WebStat::FetchAsWeb}].each do |fetch|
+    it "Get title by #{fetch[:class].to_s}" do
+      web_stat = fetch[:class].new(fetch[:fixture])
+      expect(web_stat.title).to eq "newsdict.blog"
+    end
+    it "Get site name by #{fetch[:class].to_s}" do
+      web_stat = fetch[:class].new(fetch[:fixture])
+      expect(web_stat.site_name).to eq "newsdict.blog"
+    end
+    it "Get Document's content by #{fetch[:class].to_s}" do
+      web_stat = fetch[:class].new(fetch[:fixture])
+      expect(web_stat.content).not_to eq nil
+    end
+    it "WebStat content do not include html by #{fetch[:class].to_s}" do
+      web_stat = fetch[:class].new(fetch[:fixture])
+      expect(Sanitize.clean(web_stat.content).length).to eq web_stat.content.length
+    end
+    it "Get eyecatch image blob  by #{fetch[:class].to_s}" do
+      web_stat = fetch[:class].new(fetch[:fixture])
+      web_stat.url = "https://newsdict.blog"
+      unless web_stat.stat[:eyecatch_image_path].nil?
+        image = File.read(web_stat.stat[:eyecatch_image_path])
+        expect(image.encoding.to_s).to eq("UTF-8")
+      end
+    end
+    it "Get eyecatch image path by #{fetch[:class].to_s}" do
+      web_stat = fetch[:class].new(fetch[:fixture])
+      web_stat.url = "https://newsdict.blog"
+      expect(web_stat.eyecatch_image_path).to be_string_or_nil
+    end
+    it "Get language_iso by #{fetch[:class].to_s}" do
+      web_stat = fetch[:class].new(fetch[:fixture])
+      web_stat.url = "https://newsdict.blog"
+      expect(web_stat.stat[:language_code]).to eq("ja")
+    end
+    it "Get local path of eyecatch image by #{fetch[:class].to_s}" do
+      web_stat = fetch[:class].new(fetch[:fixture])
+      web_stat.url = "https://newsdict.blog"
+      expect(web_stat.stat[:eyecatch_image_path]).to be_tmp_file_or_nil
+    end
+  end
   it "WebStat.stat_by_html" do
     WebStatTestHelper.htmls.each do |fixture|
       web_stat = WebStat.stat_by_html(fixture, "https://newsdict.blog")
@@ -104,4 +204,4 @@ RSpec.describe WebStat::Fetch do
     expect(web_stat_fetch_web_class.url_valid?("https://gxyt4.app.goo.gl/Mn64U")).to be true
     expect(web_stat_fetch_web_class.url_valid?("https://status.cloud.google.com/incident/cloud-functions/19010")).to be true
   end
-end
+end

data/web_stat.gemspec CHANGED

@@ -28,10 +28,11 @@ Gem::Specification.new do |spec|
   spec.add_runtime_dependency "sanitize", ">= 5.0.0"
   spec.add_runtime_dependency "cld", ">= 0.8.0"
   spec.add_runtime_dependency "selenium-webdriver", "= 3.142.7"
+  spec.add_runtime_dependency "pdf-reader", "2.4.0"
   spec.add_development_dependency "rake", ">= 10.0"
   spec.add_development_dependency "rspec", ">= 3.0"
   spec.add_development_dependency "pry", ">= 0.13.1"
-  spec.add_development_dependency "webmock", ">= 3.6.0"
+  spec.add_development_dependency "webmock", ">= 3.8.3"
   spec.add_development_dependency "pry-byebug", "3.9.0"
 end

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: web_stat
 version: !ruby/object:Gem::Version
-  version: 0.3.14
+  version: 0.3.19
 platform: ruby
 authors:
 - yusuke abe
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2020-06-21 00:00:00.000000000 Z
+date: 2020-11-17 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -122,6 +122,20 @@ dependencies:
     - - '='
       - !ruby/object:Gem::Version
         version: 3.142.7
+- !ruby/object:Gem::Dependency
+  name: pdf-reader
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '='
+      - !ruby/object:Gem::Version
+        version: 2.4.0
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '='
+      - !ruby/object:Gem::Version
+        version: 2.4.0
 - !ruby/object:Gem::Dependency
   name: rake
   requirement: !ruby/object:Gem::Requirement
@@ -170,14 +184,14 @@ dependencies:
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        version: 3.6.0
+        version: 3.8.3
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        version: 3.6.0
+        version: 3.8.3
 - !ruby/object:Gem::Dependency
   name: pry-byebug
   requirement: !ruby/object:Gem::Requirement
@@ -233,6 +247,8 @@ files:
 - spec/fixtures/htmls/image.html
 - spec/fixtures/images/facebook-3.jpg
 - spec/fixtures/images/newsdict-5d8601394c3f4eea2d7161ab92ab327ac7099e22214c853327011b3a71859b8e.png
+- spec/fixtures/pdfs/newsdict.blog.pdf
+- spec/fixtures/pdfs/rfc2616.pdf
 - spec/spec_helper.rb
 - spec/web_stat/configure_spec.rb
 - spec/web_stat/fetch_spec.rb
@@ -268,6 +284,8 @@ test_files:
 - spec/fixtures/htmls/image.html
 - spec/fixtures/images/facebook-3.jpg
 - spec/fixtures/images/newsdict-5d8601394c3f4eea2d7161ab92ab327ac7099e22214c853327011b3a71859b8e.png
+- spec/fixtures/pdfs/newsdict.blog.pdf
+- spec/fixtures/pdfs/rfc2616.pdf
 - spec/spec_helper.rb
 - spec/web_stat/configure_spec.rb
 - spec/web_stat/fetch_spec.rb