RubyGems - logstash-filter-ezproxy - Versions diffs - 0.1.0 → 0.1.1 - Mend

logstash-filter-ezproxy 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (35)

checksums.yaml +5 -5
data/lib/logstash/filters/dawsonera.rb +3 -9
data/lib/logstash/filters/ebscohost.rb +1 -10
data/lib/logstash/filters/emerald.rb +2 -10
data/lib/logstash/filters/ezproxy.rb +46 -23
data/lib/logstash/filters/ft.rb +57 -0
data/lib/logstash/filters/gale.rb +77 -0
data/lib/logstash/filters/ieee.rb +74 -0
data/lib/logstash/filters/jstor.rb +8 -18
data/lib/logstash/filters/lexisnexis.rb +14 -19
data/lib/logstash/filters/sage.rb +2 -5
data/lib/logstash/filters/sciencedirect.rb +3 -10
data/lib/logstash/filters/springer.rb +142 -0
data/lib/logstash/filters/tandf.rb +1 -9
data/lib/logstash/filters/wiley.rb +11 -19
data/lib/logstash/helpers/url_parser.rb +23 -0
data/logstash-filter-ezproxy.gemspec +3 -2
data/spec/filters/dawsonera/dawsonera_spec.rb +6 -1
data/spec/filters/ebscohost/ebscohost_spec.rb +7 -1
data/spec/filters/emerald/emerald_spec.rb +6 -1
data/spec/filters/ft/ft.2016-11-22.csv +9 -0
data/spec/filters/ft/ft_spec.rb +21 -0
data/spec/filters/gale/gale.2016-05-11.csv +5 -0
data/spec/filters/gale/gale_spec.rb +22 -0
data/spec/filters/ieee/ieee.2015-04-15.csv +16 -0
data/spec/filters/ieee/ieee_spec.rb +20 -0
data/spec/filters/jstor/jstor_spec.rb +6 -1
data/spec/filters/lexisnexis/lexisnexis_spec.rb +6 -1
data/spec/filters/sage/sage_spec.rb +6 -1
data/spec/filters/sciencedirect/sciencedirect_spec.rb +6 -1
data/spec/filters/springer/springer.2013-01-31.csv +21 -0
data/spec/filters/springer/springer_spec.rb +23 -0
data/spec/filters/tandf/tandf_spec.rb +6 -1
data/spec/filters/wiley/wiley_spec.rb +6 -1
metadata +34 -7

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
-SHA1:
-  metadata.gz: d55b67f72348d7a1e6b0afa2fa3df83ba7c09967
-  data.tar.gz: ca1a3ec43e516224aec41ac687f095e052fa53ed
+SHA256:
+  metadata.gz: bcb21d0a739c44738fd8b3a144306a2a102dab41bae584ce309cb5d521c74dc4
+  data.tar.gz: 76afc586013a40045266fdf4fb21be58ff4aa37db1d668d3d5270d3803ed95e2
 SHA512:
-  metadata.gz: 00f3455d7f27aa70148ac3d952b486b4dae1119bed7258bac2431ef7e0959b77ae4410fe231b07d550679f590514d48d6ce22e195b59d4c968bbe58dc01c1cbf
-  data.tar.gz: b510124c44d2df0adddbebe35b8dd418689435652fc972475c0917c254adc14ac785a547b956546fc406e0b4d4343324c4d58929c31c023a1b13ff00c9d3f9b2
+  metadata.gz: 576a44b17415f67d2f70f73f4fda6692077bf30ddc98060730969d5c108cca80f67d31d76615df7507ec0eda4bbff974a48510dec586915ae75ad460fc147174
+  data.tar.gz: 3dac55ab855093a2e9bc8ac881946ccf7036f20b4962a62b5d46566557cb528553ce26337ba2af9ef661dfa19004934f4fa18aff61e4fd63269f5b8347fa2965

data/lib/logstash/filters/dawsonera.rb CHANGED

@@ -3,20 +3,14 @@ require 'uri'
 require 'cgi'
 module DawsonEra
-  def DawsonEra.parse (input)
-    uri = URI(URI.unescape(input))
-    path = uri.path
-    params = {}
-    if (uri.query)
-        params = CGI::parse(uri.query)
-    end
+  def DawsonEra.parse (path, params)
     data = {
         "provider" => "dawsonera"
     }
+    params = {}
     if (match = /^(\/abstract\/([0-9]+))$/.match(path))
       data['rtype']             = 'ABS'
       data['mime']              = 'MISC'

data/lib/logstash/filters/ebscohost.rb CHANGED

@@ -14,20 +14,11 @@ module Ebscohost
     'id' => 'unit_id'
   }
-  def Ebscohost.parse (input)
-    uri = URI(URI.unescape(input))
-    path = uri.path
-    params = {}
-    if (uri.query)
-        params = CGI::parse(uri.query)
-    end
+  def Ebscohost.parse (path, params, uri)
     data = {
         "provider" => "ebscohost"
     }
     if ((match = /^\/(ehost|eds)\/([a-z]+)(?:\/[a-z]+)?$/i.match(path)))
       category = match[2].downcase

data/lib/logstash/filters/emerald.rb CHANGED

@@ -3,18 +3,10 @@ require 'uri'
 require 'cgi'
 module Emerald
-  def Emerald.parse (input)
-    uri = URI(URI.unescape(input))
-    path = uri.path
-    params = {}
-    if (uri.query)
-        params = CGI::parse(uri.query)
-    end
+  def Emerald.parse (path, params)
     data = {
-        "provider" => "emerald"
+      "provider" => "emerald"
     }
     if ((match = /^\/series\/([a-z]+)$/.match(path)))

data/lib/logstash/filters/ezproxy.rb CHANGED

@@ -1,6 +1,7 @@
 # encoding: utf-8
 require "logstash/filters/base"
 require "logstash/namespace"
+require_relative '../helpers/url_parser'
 require_relative "./jstor"
 require_relative "./lexisnexis"
 require_relative "./sage"
@@ -10,6 +11,10 @@ require_relative "./dawsonera"
 require_relative "./tandf"
 require_relative "./emerald"
 require_relative "./ebscohost"
+require_relative "./gale"
+require_relative "./ft"
+require_relative "./springer"
+require_relative "./ieee"
 require 'uri'
 require 'cgi'
@@ -45,9 +50,14 @@ class LogStash::Filters::Ezproxy < LogStash::Filters::Base
   public
   def filter(event)
-    url = event.get(@url)
+    input = URI::extract(event.get(@url))[0]
     data = {}
-    uri = URI(URI::extract(url)[0])
+    parsed_url = URLParser::parse(input)
+    uri = parsed_url['uri']
+    path = parsed_url['path']
+    params = parsed_url['params']
     # if (uri.host == "ezproxy.lancs.ac.uk")
     #   if (uri.query)
@@ -63,28 +73,41 @@ class LogStash::Filters::Ezproxy < LogStash::Filters::Base
     #   end
     # end
-    case
-    when uri.host.include?("www.jstor.org")
-      data = Jstor::parse(uri.to_s)
-    when uri.host.include?("www.lexisnexis.com")
-      data = LexisNexis::parse(uri.to_s)
-    when uri.host.include?("journals.sagepub.com")
-      data = Sage::parse(uri.to_s)
-    when uri.host.include?("onlinelibrary.wiley.com")
-      data = Wiley::parse(uri.to_s)
-    when uri.host.include?("www.sciencedirect.com")
-      data = ScienceDirect::parse(uri.to_s)
-    when uri.host.include?("www.dawsonera.com")
-      data = DawsonEra::parse(uri.to_s)
-    when uri.host.include?("www.tandfonline.com")
-      data = TandF::parse(uri.to_s)
-    when uri.host.include?("www.emeraldinsight.com")
-      data = Emerald::parse(uri.to_s)
-    when uri.host.include?("ebscohost.com")
-      data = Ebscohost::parse(uri.to_s)
+    unless uri == nil
+      case
+      when uri.host.include?("www.jstor.org")
+        data = Jstor::parse(path, params)
+      when uri.host.include?("www.lexisnexis.com")
+        data = LexisNexis::parse(path, params)
+      when uri.host.include?("journals.sagepub.com")
+        data = Sage::parse(path, params)
+      when uri.host.include?("wiley.com")
+        data = Wiley::parse(path, params)
+      when uri.host.include?("www.sciencedirect.com")
+        data = ScienceDirect::parse(path, params)
+      when uri.host.include?("www.dawsonera.com")
+        data = DawsonEra::parse(path, params)
+      when uri.host.include?("www.tandfonline.com")
+        data = TandF::parse(path, params)
+      when uri.host.include?("www.emeraldinsight.com")
+        data = Emerald::parse(path, params)
+      when uri.host.include?("ebscohost.com")
+        data = Ebscohost::parse(path, params, uri)
+      when uri.host.include?("els-cdn.com")
+        data = ScienceDirect::parse(path, params)
+      when uri.host.include?("springer.com")
+        data = Springer::parse(path, params)
+      when uri.host.include?("galegroup.com")
+        data = Gale::parse(path, params)
+      when uri.host.include?("ieee.org")
+        data = IEEE::parse(path, params)
+      end
+      event.set("request_metadata", data)
+      event.tag("ezproxy_parse_success")
+    else
+      event.tag("ezproxy_parse_failure")
     end
-    event.set("request_metadata", data)
-    event.tag("ezproxy_parse_success")
     # filter_matched should go in the last line of our successful code

data/lib/logstash/filters/ft.rb ADDED

@@ -0,0 +1,57 @@
+require "uri"
# Classifies Financial Times request paths.
module FT
  # Builds the request metadata for a single FT URL path.
  #
  # The path is tested against an ordered list of patterns; the first one that
  # matches decides the 'rtype' and 'mime' values and, where the path carries
  # them, 'unit_id', 'title_id' and 'publication_date'.
  #
  # @param path [String] path component of the requested URL
  # @param params [Hash] parsed query parameters (accepted for interface
  #   parity with the other provider parsers; not consulted here)
  # @return [Hash] always contains "provider" => "ft"; the remaining keys are
  #   present only when the path matched one of the known patterns
  def self.parse(path, params)
    data = { 'provider' => 'ft' }

    # Patterns are anchored with \A and \z so they must cover the whole path,
    # and the dot in front of a file extension is escaped so that it only
    # matches a literal '.'.
    case path
    when %r{\A/cms/([a-z]+)/([0-9]+)/([0-9a-z-]+)\.html\z}i
      data['rtype']   = 'ARTICLE'
      data['mime']    = 'HTML'
      data['unit_id'] = Regexp.last_match(3)
    when %r{\A/([0-9]+)/([a-z-]+)/([a-z-]+)\z}i
      data['rtype']    = 'VIDEO'
      data['mime']     = 'MISC'
      data['title_id'] = Regexp.last_match(1)
      data['unit_id']  = Regexp.last_match(2)
    when %r{\A/([0-9]{4})/([0-9]{2})/([0-9]{2})/([0-9]+)/([a-z-]+)/?\z}i
      data['rtype']    = 'ARTICLE'
      data['mime']     = 'HTML'
      data['title_id'] = Regexp.last_match(4)
      data['unit_id']  = Regexp.last_match(5)
      # Only the four-digit year segment is recorded as the publication date.
      data['publication_date'] = Regexp.last_match(1)
    when %r{\A/Olive/([A-Z]+)/([a-zA-Z]+)/?\z}i
      data['rtype'] = 'ARTICLE'
      data['mime']  = 'HTML'
    when %r{\A/data/([a-z]+)/dashboard\z}i
      data['rtype'] = 'TOOL'
      data['mime']  = 'HTML'
    when %r{\A/reports/([a-z-]+)\z}i
      data['rtype']   = 'REPORT'
      data['mime']    = 'HTML'
      data['unit_id'] = Regexp.last_match(1)
    when %r{\A/content/([a-z]+)/([0-9a-z-]+)\.pdf\z}i
      data['rtype']   = 'REPORT'
      data['mime']    = 'PDF'
      data['unit_id'] = Regexp.last_match(2)
    when %r{\A/businessschoolrankings/([a-z-]+)/(([a-z-]+)([0-9]+))\z}i
      data['rtype']   = 'TOOL'
      data['mime']    = 'HTML'
      data['unit_id'] = Regexp.last_match(2)
      # The trailing digits of the last path segment are used as the date.
      data['publication_date'] = Regexp.last_match(4)
    end

    data
  end
end

data/lib/logstash/filters/gale.rb ADDED

@@ -0,0 +1,77 @@
+require 'uri'
+require 'cgi'
# Classifies Gale request paths and query parameters.
module Gale
  # Paths whose unit id comes straight from one query parameter.
  # Each entry is [path pattern, rtype, mime, query parameter name].
  PARAM_ROUTES = [
    [%r{\A/ps/pdfViewer\z}i, 'BOOK', 'PDF', 'docId'],
    [%r{\A/gdc-artemis/bulkPdfDownload\z}i, 'ARTICLE', 'PDF', 'file_name'],
    [%r{\A/gdc/artemis/ManuscriptsDetailsPage/ManuscriptsDetailsWindow\z}i, 'ARTICLE', 'HTML', 'documentId'],
    [%r{\A/gdsc/retrieve\.do\z}i, 'ARTICLE', 'HTML', 'contentSet'],
    [%r{\A/gdsc/downloadDocument\.do\z}i, 'ARTICLE', 'PDF', 'docId'],
    [%r{\A/([a-z]+)/([a-z]+)/MonographsDetailsPage/MonographsDetailsWindow\z}i, 'ARTICLE', 'HTML', 'documentId'],
    [%r{\A/([a-z]+)/archive/FeatureArticlesDetailsPage/FeatureArticlesDetailsWindow\z}i, 'ENCYCLOPAEDIA_ENTRY', 'HTML', 'documentId']
  ].freeze

  # Builds the request metadata for a single Gale URL.
  #
  # @param path [String] path component of the requested URL
  # @param params [Hash] parsed query parameters, keyed by parameter name;
  #   values may be arrays of strings or plain strings
  # @return [Hash] always contains "provider" => "gale"; 'rtype', 'mime',
  #   'title_id' and 'unit_id' are added when the path is recognised and the
  #   required query parameters are present
  def self.parse(path, params)
    data = { 'provider' => 'gale' }

    case path
    when %r{\A/ps/([a-z]+)\.do\z}i
      action = Regexp.last_match(1)
      # Actions such as "eToc" are tables of contents; everything else under
      # /ps/*.do is treated as an encyclopaedia entry.
      data['rtype'] = action =~ /[\w]Toc/ ? 'TOC' : 'ENCYCLOPAEDIA_ENTRY'
      data['mime']  = 'HTML'

      doc_id = first_param(params, 'docId')
      if doc_id
        data['title_id'] = doc_id
        # A missing contentSegment contributes an empty suffix.
        data['unit_id'] = "#{doc_id}_#{first_param(params, 'contentSegment')}"
      end

      work_id = first_param(params, 'workId')
      if work_id && work_id =~ /[\w\W]pdf/
        data['mime'] = 'PDF'
        # Only the part of workId before the first '|' identifies the file;
        # a missing docId contributes an empty prefix instead of raising.
        data['unit_id'] = "#{doc_id}_#{work_id.split('|')[0]}"
      end
    when %r{\A/cgi-bin/([a-z]+)\z}i
      data['rtype'] = 'ENCYCLOPAEDIA_ENTRY'
      data['mime']  = 'MISC'

      doc_id  = first_param(params, 'docId')
      segment = first_param(params, 'contentSegment')
      if doc_id && segment
        data['title_id'] = doc_id
        data['unit_id']  = "#{doc_id}_#{segment}"
      end
    else
      route = PARAM_ROUTES.find { |entry| entry[0] =~ path }
      if route
        data['rtype'] = route[1]
        data['mime']  = route[2]
        # The id is left out entirely when the query parameter is absent.
        unit_id = first_param(params, route[3])
        data['unit_id'] = unit_id if unit_id
      end
    end

    data
  end

  # Returns the first value stored under +key+, or nil when there is none.
  # Accepts array values as well as a bare string.
  def self.first_param(params, key)
    Array(params[key]).first
  end
  private_class_method :first_param
end

data/lib/logstash/filters/ieee.rb ADDED

@@ -0,0 +1,74 @@
+require 'uri'
+require 'cgi'
+module IEEE
+  def IEEE.parse (path, params)
+    data = {
+      "provider" => "springer"
+    }
+    if (/^\/xpl\/(([a-zA-Z]+)\.jsp)/.match(path))
+      if (params.key?('punumber'))
+        data['rtype']    = 'TOC'
+        data['mime']     = 'HTML'
+        data['title_id'] = params['punumber'][0]
+        data['unit_id']   = params['punumber'][0]
+      elsif (params.key?('arnumber'))
+        data['rtype']    = 'ABS'
+        data['mime']     = 'HTML'
+        data['title_id'] = params['arnumber'][0]
+        data['unit_id']   = params['arnumber'][0]
+      elsif (params.key?('bkn'))
+        data['rtype']    = 'TOC'
+        data['mime']     = 'HTML'
+        data['title_id'] = params['bkn'][0]
+        data['unit_id'] = params['bkn'][0]
+      end
+    elsif (/^\/xpls\/(([a-z]+)\.jsp)/.match(path))
+      data['rtype'] = 'ARTICLE'
+      data['mime']  = 'HTML'
+      if (params.key?('arnumber'))
+        data['title_id'] = params['arnumber'][0]
+        data['unit_id']   = params['arnumber'][0]
+      end
+    elsif (/^\/stamp\/(([a-z]+)\.jsp)/.match(path))
+      data['rtype']    = 'ARTICLE'
+      data['mime']     = 'PDF'
+      if (params.key?('arnumber'))
+        data['title_id'] = params['arnumber'][0]
+        data['unit_id']   = params['arnumber'][0]
+      end
+    elsif ((match = /^\/ielx7\/([0-9]+)\/([0-9]+)\/([0-9]+)\.pdf/.match(path)))
+      data['rtype']    = 'ARTICLE'
+      data['mime']     = 'PDF'
+      data['title_id'] = match[2]
+      data['unit_id']   = match[3]
+    elsif ((match = /^\/stampPDF\/(([a-zA-Z]+)\.jsp)/.match(path)))
+      data['rtype']    = 'ARTICLE'
+      data['mime']     = 'PDF'
+      if (params.key?('arnumber'))
+        data['title_id'] = params['arnumber'][0]
+        data['unit_id']   = params['arnumber'][0]
+      end
+    elsif ((match = /^\/courses\/([a-z]+)\/([A-Z0-9]+)\/([a-z]+)\/([a-z]+)/.match(path)))
+      data['rtype']    = 'ONLINE_COURSE'
+      data['mime']     = 'FLASH'
+      data['unit_id']   = match[2]
+    elsif ((match = /^\/courses\/([a-z]+)\/([A-Z0-9]+)/.match(path)))
+      data['rtype']    = 'ABS'
+      data['mime']     = 'MISC'
+      data['unit_id']   = match[2]
+    end
+    return data
+  end
+end

data/lib/logstash/filters/jstor.rb CHANGED

@@ -1,16 +1,6 @@
-require 'uri'
-require 'cgi'
 module Jstor
-    def Jstor.parse (input)
-        uri = URI(URI.unescape(input))
-        url = uri.path
-        params = {}
-        if (uri.query)
-            params = CGI::parse(uri.query)
-        end
+    def Jstor.parse (path, params)
         data = {
             "provider" => "jstor"
@@ -18,13 +8,13 @@ module Jstor
         doi_prefix = "10.2307"
-        if (match = /^\/journal\/([a-z0-9]+)$/i.match(url))
+        if (match = /^\/journal\/([a-z0-9]+)$/i.match(path))
             data["rtype"] = "TOC"
             data["mime"] = "MISC"
             data["unit_id"] = match[1]
             data["title_id"] = match[1]
-        elsif (match = /^\/stable\/10\.[0-9]+\/(([a-z]+)\.([0-9]+)\.([0-9]+)\.issue-([0-9]+))$/i.match(url))
+        elsif (match = /^\/stable\/10\.[0-9]+\/(([a-z]+)\.([0-9]+)\.([0-9]+)\.issue-([0-9]+))$/i.match(path))
             data["rtype"] = "TOC"
             data["mime"] = "MISC"
             data["unit_id"] = match[1]
@@ -38,7 +28,7 @@ module Jstor
                 data["vol"] = match[3]
             end
-        elsif (match = /^\/stable\/((10\.[0-9]+\/)?([a-z0-9]+))$/i.match(url))
+        elsif (match = /^\/stable\/((10\.[0-9]+\/)?([a-z0-9]+))$/i.match(path))
             data["rtype"] = "TOC"
             data["mime"] = "MISC"
             data["unit_id"] = match[3]
@@ -48,13 +38,13 @@ module Jstor
                 data["doi"] = match[1]
             end
-        elsif (match =  /^\/stable\/(i[0-9]+)$/i.match(url))
+        elsif (match =  /^\/stable\/(i[0-9]+)$/i.match(path))
             data["rtype"] = "TOC"
             data["mime"] = "MISC"
             data["unit_id"] = match[1]
             data["title_id"] = match[1]
-        elsif (/^\/action\/showPublication$/i.match(url))
+        elsif (/^\/action\/showPublication$/i.match(path))
             if (params["journalCode"])
                 data["title_id"] = params["journalCode"][0]
                 data["unit_id"] = params["journalCode"][0]
@@ -62,7 +52,7 @@ module Jstor
                 data["mime"] = 'MISC'
             end
-        elsif (match =  /^\/stable\/(get_image|pdf|pdfplus)\/((10\.[0-9]+\/)?([a-z0-9.]+?))(?:\.pdf)?$/i.match(url))
+        elsif (match =  /^\/stable\/(get_image|pdf|pdfplus)\/((10\.[0-9]+\/)?([a-z0-9.]+?))(?:\.pdf)?$/i.match(path))
             data["unit_id"] = match[4]
             data["doi"] = match[3] ? match[2] : doi_prefix + "/" + match[2]
@@ -99,7 +89,7 @@ module Jstor
                 end
             end
-        elsif (match = /^\/stable\/(info|view)\/([0-9]+)$/i.match(url))
+        elsif (match = /^\/stable\/(info|view)\/([0-9]+)$/i.match(path))
             data["rtype"] = match[1] === 'info' ? "ABS" : "PREVIEW"
             data["mime"] = "MISC"
             data["unit_id"] = match[2]