logstash-filter-ezproxy 0.1.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (35) hide show
  1. checksums.yaml +5 -5
  2. data/lib/logstash/filters/dawsonera.rb +3 -9
  3. data/lib/logstash/filters/ebscohost.rb +1 -10
  4. data/lib/logstash/filters/emerald.rb +2 -10
  5. data/lib/logstash/filters/ezproxy.rb +46 -23
  6. data/lib/logstash/filters/ft.rb +57 -0
  7. data/lib/logstash/filters/gale.rb +77 -0
  8. data/lib/logstash/filters/ieee.rb +74 -0
  9. data/lib/logstash/filters/jstor.rb +8 -18
  10. data/lib/logstash/filters/lexisnexis.rb +14 -19
  11. data/lib/logstash/filters/sage.rb +2 -5
  12. data/lib/logstash/filters/sciencedirect.rb +3 -10
  13. data/lib/logstash/filters/springer.rb +142 -0
  14. data/lib/logstash/filters/tandf.rb +1 -9
  15. data/lib/logstash/filters/wiley.rb +11 -19
  16. data/lib/logstash/helpers/url_parser.rb +23 -0
  17. data/logstash-filter-ezproxy.gemspec +3 -2
  18. data/spec/filters/dawsonera/dawsonera_spec.rb +6 -1
  19. data/spec/filters/ebscohost/ebscohost_spec.rb +7 -1
  20. data/spec/filters/emerald/emerald_spec.rb +6 -1
  21. data/spec/filters/ft/ft.2016-11-22.csv +9 -0
  22. data/spec/filters/ft/ft_spec.rb +21 -0
  23. data/spec/filters/gale/gale.2016-05-11.csv +5 -0
  24. data/spec/filters/gale/gale_spec.rb +22 -0
  25. data/spec/filters/ieee/ieee.2015-04-15.csv +16 -0
  26. data/spec/filters/ieee/ieee_spec.rb +20 -0
  27. data/spec/filters/jstor/jstor_spec.rb +6 -1
  28. data/spec/filters/lexisnexis/lexisnexis_spec.rb +6 -1
  29. data/spec/filters/sage/sage_spec.rb +6 -1
  30. data/spec/filters/sciencedirect/sciencedirect_spec.rb +6 -1
  31. data/spec/filters/springer/springer.2013-01-31.csv +21 -0
  32. data/spec/filters/springer/springer_spec.rb +23 -0
  33. data/spec/filters/tandf/tandf_spec.rb +6 -1
  34. data/spec/filters/wiley/wiley_spec.rb +6 -1
  35. metadata +34 -7
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: d55b67f72348d7a1e6b0afa2fa3df83ba7c09967
4
- data.tar.gz: ca1a3ec43e516224aec41ac687f095e052fa53ed
2
+ SHA256:
3
+ metadata.gz: bcb21d0a739c44738fd8b3a144306a2a102dab41bae584ce309cb5d521c74dc4
4
+ data.tar.gz: 76afc586013a40045266fdf4fb21be58ff4aa37db1d668d3d5270d3803ed95e2
5
5
  SHA512:
6
- metadata.gz: 00f3455d7f27aa70148ac3d952b486b4dae1119bed7258bac2431ef7e0959b77ae4410fe231b07d550679f590514d48d6ce22e195b59d4c968bbe58dc01c1cbf
7
- data.tar.gz: b510124c44d2df0adddbebe35b8dd418689435652fc972475c0917c254adc14ac785a547b956546fc406e0b4d4343324c4d58929c31c023a1b13ff00c9d3f9b2
6
+ metadata.gz: 576a44b17415f67d2f70f73f4fda6692077bf30ddc98060730969d5c108cca80f67d31d76615df7507ec0eda4bbff974a48510dec586915ae75ad460fc147174
7
+ data.tar.gz: 3dac55ab855093a2e9bc8ac881946ccf7036f20b4962a62b5d46566557cb528553ce26337ba2af9ef661dfa19004934f4fa18aff61e4fd63269f5b8347fa2965
@@ -3,20 +3,14 @@ require 'uri'
3
3
  require 'cgi'
4
4
 
5
5
  module DawsonEra
6
- def DawsonEra.parse (input)
7
-
8
- uri = URI(URI.unescape(input))
9
-
10
- path = uri.path
11
- params = {}
12
- if (uri.query)
13
- params = CGI::parse(uri.query)
14
- end
6
+ def DawsonEra.parse (path, params)
15
7
 
16
8
  data = {
17
9
  "provider" => "dawsonera"
18
10
  }
19
11
 
12
+ params = {}
13
+
20
14
  if (match = /^(\/abstract\/([0-9]+))$/.match(path))
21
15
  data['rtype'] = 'ABS'
22
16
  data['mime'] = 'MISC'
@@ -14,20 +14,11 @@ module Ebscohost
14
14
  'id' => 'unit_id'
15
15
  }
16
16
 
17
- def Ebscohost.parse (input)
18
-
19
- uri = URI(URI.unescape(input))
20
-
21
- path = uri.path
22
- params = {}
23
- if (uri.query)
24
- params = CGI::parse(uri.query)
25
- end
17
+ def Ebscohost.parse (path, params, uri)
26
18
 
27
19
  data = {
28
20
  "provider" => "ebscohost"
29
21
  }
30
-
31
22
 
32
23
  if ((match = /^\/(ehost|eds)\/([a-z]+)(?:\/[a-z]+)?$/i.match(path)))
33
24
  category = match[2].downcase
@@ -3,18 +3,10 @@ require 'uri'
3
3
  require 'cgi'
4
4
 
5
5
  module Emerald
6
- def Emerald.parse (input)
7
-
8
- uri = URI(URI.unescape(input))
9
-
10
- path = uri.path
11
- params = {}
12
- if (uri.query)
13
- params = CGI::parse(uri.query)
14
- end
6
+ def Emerald.parse (path, params)
15
7
 
16
8
  data = {
17
- "provider" => "emerald"
9
+ "provider" => "emerald"
18
10
  }
19
11
 
20
12
  if ((match = /^\/series\/([a-z]+)$/.match(path)))
@@ -1,6 +1,7 @@
1
1
  # encoding: utf-8
2
2
  require "logstash/filters/base"
3
3
  require "logstash/namespace"
4
+ require_relative '../helpers/url_parser'
4
5
  require_relative "./jstor"
5
6
  require_relative "./lexisnexis"
6
7
  require_relative "./sage"
@@ -10,6 +11,10 @@ require_relative "./dawsonera"
10
11
  require_relative "./tandf"
11
12
  require_relative "./emerald"
12
13
  require_relative "./ebscohost"
14
+ require_relative "./gale"
15
+ require_relative "./ft"
16
+ require_relative "./springer"
17
+ require_relative "./ieee"
13
18
  require 'uri'
14
19
  require 'cgi'
15
20
 
@@ -45,9 +50,14 @@ class LogStash::Filters::Ezproxy < LogStash::Filters::Base
45
50
 
46
51
  public
47
52
  def filter(event)
48
- url = event.get(@url)
53
+ input = URI::extract(event.get(@url))[0]
54
+
49
55
  data = {}
50
- uri = URI(URI::extract(url)[0])
56
+
57
+ parsed_url = URLParser::parse(input)
58
+ uri = parsed_url['uri']
59
+ path = parsed_url['path']
60
+ params = parsed_url['params']
51
61
 
52
62
  # if (uri.host == "ezproxy.lancs.ac.uk")
53
63
  # if (uri.query)
@@ -63,28 +73,41 @@ class LogStash::Filters::Ezproxy < LogStash::Filters::Base
63
73
  # end
64
74
  # end
65
75
 
66
- case
67
- when uri.host.include?("www.jstor.org")
68
- data = Jstor::parse(uri.to_s)
69
- when uri.host.include?("www.lexisnexis.com")
70
- data = LexisNexis::parse(uri.to_s)
71
- when uri.host.include?("journals.sagepub.com")
72
- data = Sage::parse(uri.to_s)
73
- when uri.host.include?("onlinelibrary.wiley.com")
74
- data = Wiley::parse(uri.to_s)
75
- when uri.host.include?("www.sciencedirect.com")
76
- data = ScienceDirect::parse(uri.to_s)
77
- when uri.host.include?("www.dawsonera.com")
78
- data = DawsonEra::parse(uri.to_s)
79
- when uri.host.include?("www.tandfonline.com")
80
- data = TandF::parse(uri.to_s)
81
- when uri.host.include?("www.emeraldinsight.com")
82
- data = Emerald::parse(uri.to_s)
83
- when uri.host.include?("ebscohost.com")
84
- data = Ebscohost::parse(uri.to_s)
76
+ unless uri == nil
77
+
78
+ case
79
+ when uri.host.include?("www.jstor.org")
80
+ data = Jstor::parse(path, params)
81
+ when uri.host.include?("www.lexisnexis.com")
82
+ data = LexisNexis::parse(path, params)
83
+ when uri.host.include?("journals.sagepub.com")
84
+ data = Sage::parse(path, params)
85
+ when uri.host.include?("wiley.com")
86
+ data = Wiley::parse(path, params)
87
+ when uri.host.include?("www.sciencedirect.com")
88
+ data = ScienceDirect::parse(path, params)
89
+ when uri.host.include?("www.dawsonera.com")
90
+ data = DawsonEra::parse(path, params)
91
+ when uri.host.include?("www.tandfonline.com")
92
+ data = TandF::parse(path, params)
93
+ when uri.host.include?("www.emeraldinsight.com")
94
+ data = Emerald::parse(path, params)
95
+ when uri.host.include?("ebscohost.com")
96
+ data = Ebscohost::parse(path, params, uri)
97
+ when uri.host.include?("els-cdn.com")
98
+ data = ScienceDirect::parse(path, params)
99
+ when uri.host.include?("springer.com")
100
+ data = Springer::parse(path, params)
101
+ when uri.host.include?("galegroup.com")
102
+ data = Gale::parse(path, params)
103
+ when uri.host.include?("ieee.org")
104
+ data = IEEE::parse(path, params)
105
+ end
106
+ event.set("request_metadata", data)
107
+ event.tag("ezproxy_parse_success")
108
+ else
109
+ event.tag("ezproxy_parse_failure")
85
110
  end
86
- event.set("request_metadata", data)
87
- event.tag("ezproxy_parse_success")
88
111
 
89
112
 
90
113
  # filter_matched should go in the last line of our successful code
@@ -0,0 +1,57 @@
1
+ require "uri"
2
+
3
# Parser for Financial Times request paths.
module FT
  # Classifies an FT request path into resource metadata.
  #
  # @param path [String] path component of the requested URL
  # @param params [Hash] parsed query-string parameters; no FT rule reads
  #   them, the argument is accepted for parity with the other provider
  #   parsers
  # @return [Hash] always holds "provider" => "ft"; "rtype", "mime",
  #   "unit_id", "title_id" and "publication_date" are added when the path
  #   matches one of the known patterns below
  def self.parse(path, params)
    data = { "provider" => "ft" }

    # The literal dot before the file extension is escaped so that e.g.
    # "fooxhtml" is not mistaken for "foo.html".
    if (match = /^\/cms\/([a-z]+)\/([0-9]+)\/([0-9a-z-]+)\.html$/i.match(path))
      data['rtype'] = 'ARTICLE'
      data['mime'] = 'HTML'
      data['unit_id'] = match[3]

    elsif (match = /^\/([0-9]+)\/([a-z-]+)\/([a-z-]+)$/i.match(path))
      data['rtype'] = 'VIDEO'
      data['mime'] = 'MISC'
      data['title_id'] = match[1]
      data['unit_id'] = match[2]

    elsif (match = /^\/([0-9]{4})\/([0-9]{2})\/([0-9]{2})\/([0-9]+)\/([a-z-]+)\/?$/i.match(path))
      data['rtype'] = 'ARTICLE'
      data['mime'] = 'HTML'
      data['title_id'] = match[4]
      data['unit_id'] = match[5]
      # Only the four-digit year segment is recorded.
      data['publication_date'] = match[1]

    elsif /^\/Olive\/([A-Z]+)\/([a-zA-Z]+)\/?$/i.match(path)
      data['rtype'] = 'ARTICLE'
      data['mime'] = 'HTML'

    elsif /^\/data\/([a-z]+)\/dashboard$/i.match(path)
      data['rtype'] = 'TOOL'
      data['mime'] = 'HTML'

    elsif (match = /^\/reports\/([a-z-]+)$/i.match(path))
      data['rtype'] = 'REPORT'
      data['mime'] = 'HTML'
      data['unit_id'] = match[1]

    elsif (match = /^\/content\/([a-z]+)\/([0-9a-z-]+)\.pdf$/i.match(path))
      data['rtype'] = 'REPORT'
      data['mime'] = 'PDF'
      data['unit_id'] = match[2]

    elsif (match = /^\/businessschoolrankings\/([a-z-]+)\/(([a-z-]+)([0-9]+))$/i.match(path))
      data['rtype'] = 'TOOL'
      data['mime'] = 'HTML'
      data['unit_id'] = match[2]
      # Trailing digits of the ranking slug.
      data['publication_date'] = match[4]
    end

    data
  end
end
@@ -0,0 +1,77 @@
1
+ require 'uri'
2
+ require 'cgi'
3
+
4
# Parser for Gale (galegroup.com) request paths.
module Gale
  # Paths that are classified purely by pattern, with the unit id taken from
  # a single query parameter: [pattern, rtype, mime, query parameter name].
  SIMPLE_RULES = [
    [/^\/ps\/pdfViewer$/i, 'BOOK', 'PDF', 'docId'],
    [/^\/gdc-artemis\/bulkPdfDownload$/i, 'ARTICLE', 'PDF', 'file_name'],
    [/^\/gdc\/artemis\/ManuscriptsDetailsPage\/ManuscriptsDetailsWindow$/i, 'ARTICLE', 'HTML', 'documentId'],
    [/^\/gdsc\/retrieve\.do$/i, 'ARTICLE', 'HTML', 'contentSet'],
    [/^\/gdsc\/downloadDocument\.do$/i, 'ARTICLE', 'PDF', 'docId'],
    [/^\/([a-z]+)\/([a-z]+)\/MonographsDetailsPage\/MonographsDetailsWindow$/i, 'ARTICLE', 'HTML', 'documentId'],
    [/^\/([a-z]+)\/archive\/FeatureArticlesDetailsPage\/FeatureArticlesDetailsWindow$/i, 'ENCYCLOPAEDIA_ENTRY', 'HTML', 'documentId']
  ].freeze

  # Classifies a Gale request into resource metadata.
  #
  # @param path [String] path component of the requested URL
  # @param params [Hash{String => Array<String>}] parsed query parameters,
  #   each value being the list of values given for that key
  # @return [Hash] always holds "provider" => "gale"; "rtype", "mime",
  #   "title_id" and "unit_id" are added when the path matches a known
  #   pattern and the relevant query parameters are present
  def self.parse(path, params)
    data = { "provider" => "gale" }

    if (match = /^\/ps\/([a-zA-Z]+)\.do$/i.match(path))
      # Actions whose name contains "<char>Toc" are tables of contents.
      data['rtype'] = /[\w]Toc/.match(match[1]) ? 'TOC' : 'ENCYCLOPAEDIA_ENTRY'
      data['mime'] = 'HTML'

      doc_id = first_param(params, 'docId')
      if doc_id
        data['title_id'] = doc_id
        # A missing contentSegment yields "<docId>_" rather than an error.
        data['unit_id'] = "#{doc_id}_#{first_param(params, 'contentSegment')}"
      end

      work_id = first_param(params, 'workId')
      if work_id && /[\w\W]pdf/.match(work_id)
        data['mime'] = 'PDF'
        # Without a docId there is nothing to build the unit id from.
        data['unit_id'] = "#{doc_id}_#{work_id.split('|').first}" if doc_id
      end

    elsif /^\/cgi-bin\/([a-z]+)$/i.match(path)
      data['rtype'] = 'ENCYCLOPAEDIA_ENTRY'
      data['mime'] = 'MISC'
      doc_id = first_param(params, 'docId')
      segment = first_param(params, 'contentSegment')
      if doc_id && segment
        data['title_id'] = doc_id
        data['unit_id'] = "#{doc_id}_#{segment}"
      end

    else
      rule = SIMPLE_RULES.find { |pattern, _rtype, _mime, _key| pattern.match(path) }
      if rule
        _pattern, rtype, mime, key = rule
        data['rtype'] = rtype
        data['mime'] = mime
        unit_id = first_param(params, key)
        data['unit_id'] = unit_id if unit_id
      end
    end

    data
  end

  # Returns the first value of a query parameter, or nil when the key is
  # absent or has no values. Works whether or not the hash has a default.
  def self.first_param(params, key)
    Array(params[key]).first
  end
  private_class_method :first_param
end
@@ -0,0 +1,74 @@
1
+ require 'uri'
2
+ require 'cgi'
3
+
4
# Parser for IEEE Xplore (ieee.org) request paths.
module IEEE
  # Classifies an IEEE request into resource metadata.
  #
  # @param path [String] path component of the requested URL
  # @param params [Hash{String => Array<String>}] parsed query parameters,
  #   each value being the list of values given for that key
  # @return [Hash] always holds "provider" => "ieee"; "rtype", "mime",
  #   "title_id" and "unit_id" are added when the path matches a known
  #   pattern and the relevant query parameters are present
  def self.parse(path, params)
    # Previously reported as "springer", a copy-paste slip from the
    # Springer parser.
    data = { "provider" => "ieee" }

    if /^\/xpl\/(([a-zA-Z]+)\.jsp)/.match(path)
      # The first identifier present decides the resource type.
      if params.key?('punumber')
        data['rtype'] = 'TOC'
        data['mime'] = 'HTML'
        assign_ids(data, params, 'punumber')
      elsif params.key?('arnumber')
        data['rtype'] = 'ABS'
        data['mime'] = 'HTML'
        assign_ids(data, params, 'arnumber')
      elsif params.key?('bkn')
        data['rtype'] = 'TOC'
        data['mime'] = 'HTML'
        assign_ids(data, params, 'bkn')
      end

    elsif /^\/xpls\/(([a-z]+)\.jsp)/.match(path)
      data['rtype'] = 'ARTICLE'
      data['mime'] = 'HTML'
      assign_ids(data, params, 'arnumber') if params.key?('arnumber')

    elsif /^\/stamp\/(([a-z]+)\.jsp)/.match(path)
      data['rtype'] = 'ARTICLE'
      data['mime'] = 'PDF'
      assign_ids(data, params, 'arnumber') if params.key?('arnumber')

    elsif (match = /^\/ielx7\/([0-9]+)\/([0-9]+)\/([0-9]+)\.pdf/.match(path))
      data['rtype'] = 'ARTICLE'
      data['mime'] = 'PDF'
      data['title_id'] = match[2]
      data['unit_id'] = match[3]

    elsif /^\/stampPDF\/(([a-zA-Z]+)\.jsp)/.match(path)
      data['rtype'] = 'ARTICLE'
      data['mime'] = 'PDF'
      assign_ids(data, params, 'arnumber') if params.key?('arnumber')

    # The four-segment course path must be tested before the two-segment
    # one, which would otherwise match it as a prefix.
    elsif (match = /^\/courses\/([a-z]+)\/([A-Z0-9]+)\/([a-z]+)\/([a-z]+)/.match(path))
      data['rtype'] = 'ONLINE_COURSE'
      data['mime'] = 'FLASH'
      data['unit_id'] = match[2]

    elsif (match = /^\/courses\/([a-z]+)\/([A-Z0-9]+)/.match(path))
      data['rtype'] = 'ABS'
      data['mime'] = 'MISC'
      data['unit_id'] = match[2]
    end

    data
  end

  # Stores the first value of query parameter +key+ as both the title id
  # and the unit id.
  def self.assign_ids(data, params, key)
    value = params[key][0]
    data['title_id'] = value
    data['unit_id'] = value
  end
  private_class_method :assign_ids
end
@@ -1,16 +1,6 @@
1
- require 'uri'
2
- require 'cgi'
3
1
 
4
2
  module Jstor
5
- def Jstor.parse (input)
6
-
7
- uri = URI(URI.unescape(input))
8
-
9
- url = uri.path
10
- params = {}
11
- if (uri.query)
12
- params = CGI::parse(uri.query)
13
- end
3
+ def Jstor.parse (path, params)
14
4
 
15
5
  data = {
16
6
  "provider" => "jstor"
@@ -18,13 +8,13 @@ module Jstor
18
8
  doi_prefix = "10.2307"
19
9
 
20
10
 
21
- if (match = /^\/journal\/([a-z0-9]+)$/i.match(url))
11
+ if (match = /^\/journal\/([a-z0-9]+)$/i.match(path))
22
12
  data["rtype"] = "TOC"
23
13
  data["mime"] = "MISC"
24
14
  data["unit_id"] = match[1]
25
15
  data["title_id"] = match[1]
26
16
 
27
- elsif (match = /^\/stable\/10\.[0-9]+\/(([a-z]+)\.([0-9]+)\.([0-9]+)\.issue-([0-9]+))$/i.match(url))
17
+ elsif (match = /^\/stable\/10\.[0-9]+\/(([a-z]+)\.([0-9]+)\.([0-9]+)\.issue-([0-9]+))$/i.match(path))
28
18
  data["rtype"] = "TOC"
29
19
  data["mime"] = "MISC"
30
20
  data["unit_id"] = match[1]
@@ -38,7 +28,7 @@ module Jstor
38
28
  data["vol"] = match[3]
39
29
  end
40
30
 
41
- elsif (match = /^\/stable\/((10\.[0-9]+\/)?([a-z0-9]+))$/i.match(url))
31
+ elsif (match = /^\/stable\/((10\.[0-9]+\/)?([a-z0-9]+))$/i.match(path))
42
32
  data["rtype"] = "TOC"
43
33
  data["mime"] = "MISC"
44
34
  data["unit_id"] = match[3]
@@ -48,13 +38,13 @@ module Jstor
48
38
  data["doi"] = match[1]
49
39
  end
50
40
 
51
- elsif (match = /^\/stable\/(i[0-9]+)$/i.match(url))
41
+ elsif (match = /^\/stable\/(i[0-9]+)$/i.match(path))
52
42
  data["rtype"] = "TOC"
53
43
  data["mime"] = "MISC"
54
44
  data["unit_id"] = match[1]
55
45
  data["title_id"] = match[1]
56
46
 
57
- elsif (/^\/action\/showPublication$/i.match(url))
47
+ elsif (/^\/action\/showPublication$/i.match(path))
58
48
  if (params["journalCode"])
59
49
  data["title_id"] = params["journalCode"][0]
60
50
  data["unit_id"] = params["journalCode"][0]
@@ -62,7 +52,7 @@ module Jstor
62
52
  data["mime"] = 'MISC'
63
53
  end
64
54
 
65
- elsif (match = /^\/stable\/(get_image|pdf|pdfplus)\/((10\.[0-9]+\/)?([a-z0-9.]+?))(?:\.pdf)?$/i.match(url))
55
+ elsif (match = /^\/stable\/(get_image|pdf|pdfplus)\/((10\.[0-9]+\/)?([a-z0-9.]+?))(?:\.pdf)?$/i.match(path))
66
56
  data["unit_id"] = match[4]
67
57
  data["doi"] = match[3] ? match[2] : doi_prefix + "/" + match[2]
68
58
 
@@ -99,7 +89,7 @@ module Jstor
99
89
  end
100
90
  end
101
91
 
102
- elsif (match = /^\/stable\/(info|view)\/([0-9]+)$/i.match(url))
92
+ elsif (match = /^\/stable\/(info|view)\/([0-9]+)$/i.match(path))
103
93
  data["rtype"] = match[1] === 'info' ? "ABS" : "PREVIEW"
104
94
  data["mime"] = "MISC"
105
95
  data["unit_id"] = match[2]