logstash-filter-ezproxy 0.1.0 → 0.1.1

Files changed (35)
  1. checksums.yaml +5 -5
  2. data/lib/logstash/filters/dawsonera.rb +3 -9
  3. data/lib/logstash/filters/ebscohost.rb +1 -10
  4. data/lib/logstash/filters/emerald.rb +2 -10
  5. data/lib/logstash/filters/ezproxy.rb +46 -23
  6. data/lib/logstash/filters/ft.rb +57 -0
  7. data/lib/logstash/filters/gale.rb +77 -0
  8. data/lib/logstash/filters/ieee.rb +74 -0
  9. data/lib/logstash/filters/jstor.rb +8 -18
  10. data/lib/logstash/filters/lexisnexis.rb +14 -19
  11. data/lib/logstash/filters/sage.rb +2 -5
  12. data/lib/logstash/filters/sciencedirect.rb +3 -10
  13. data/lib/logstash/filters/springer.rb +142 -0
  14. data/lib/logstash/filters/tandf.rb +1 -9
  15. data/lib/logstash/filters/wiley.rb +11 -19
  16. data/lib/logstash/helpers/url_parser.rb +23 -0
  17. data/logstash-filter-ezproxy.gemspec +3 -2
  18. data/spec/filters/dawsonera/dawsonera_spec.rb +6 -1
  19. data/spec/filters/ebscohost/ebscohost_spec.rb +7 -1
  20. data/spec/filters/emerald/emerald_spec.rb +6 -1
  21. data/spec/filters/ft/ft.2016-11-22.csv +9 -0
  22. data/spec/filters/ft/ft_spec.rb +21 -0
  23. data/spec/filters/gale/gale.2016-05-11.csv +5 -0
  24. data/spec/filters/gale/gale_spec.rb +22 -0
  25. data/spec/filters/ieee/ieee.2015-04-15.csv +16 -0
  26. data/spec/filters/ieee/ieee_spec.rb +20 -0
  27. data/spec/filters/jstor/jstor_spec.rb +6 -1
  28. data/spec/filters/lexisnexis/lexisnexis_spec.rb +6 -1
  29. data/spec/filters/sage/sage_spec.rb +6 -1
  30. data/spec/filters/sciencedirect/sciencedirect_spec.rb +6 -1
  31. data/spec/filters/springer/springer.2013-01-31.csv +21 -0
  32. data/spec/filters/springer/springer_spec.rb +23 -0
  33. data/spec/filters/tandf/tandf_spec.rb +6 -1
  34. data/spec/filters/wiley/wiley_spec.rb +6 -1
  35. metadata +34 -7
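
Every diff below follows the same refactor: URL unescaping and query-string parsing move out of the individual provider modules into a new shared helper, data/lib/logstash/helpers/url_parser.rb, so each Provider.parse method now receives a pre-split path and params hash instead of a raw URL string. A minimal sketch of the new calling convention, assuming a hypothetical logged URL (the url variable and the Sage example are illustrative, not taken from the diff):

    require 'logstash/helpers/url_parser'
    require 'logstash/filters/sage'

    url = 'http://journals.sagepub.com/doi/abs/10.1177/0001'  # hypothetical

    # 0.1.0 style: each parser unescaped and split the URL itself.
    # data = Sage.parse(url)

    # 0.1.1 style: parse once, then hand every provider the same pieces.
    parsed = URLParser::parse(url)
    data = Sage.parse(parsed['path'], parsed['params']) unless parsed.empty?

The same mechanical change is applied to every existing provider (lexisnexis, sage, sciencedirect, tandf, wiley, ...) and to each spec file; ft.rb, gale.rb, ieee.rb and springer.rb are new parsers that use the new signature from the start.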
data/lib/logstash/filters/lexisnexis.rb
@@ -1,35 +1,30 @@
 require 'uri'
 
 module LexisNexis
-  def LexisNexis.parse (input)
-    uri = URI(URI.unescape(input))
-
-    path = uri.path
+  def LexisNexis.parse (path, params)
 
     data = {
       "provider" => "lexisnexis"
     }
 
     if (/\/droit\/results\/docview\/docview/.match(path))
-      if (uri.query)
-        params = CGI::parse(uri.query)
 
-        if (params["risb"])
-          data["title_id"] = params["risb"][0]
-          data["unit_id"] = params["risb"][0]
-        end
+      if (params.key?("risb"))
+        data["title_id"] = params["risb"][0]
+        data["unit_id"] = params["risb"][0]
+      end
 
-        if (params["format"])
-          case params["format"][0]
-          when 'GNBFULL'
-            data["rtype"] = 'ARTICLE'
-            data["mime"] = 'HTML'
-          when 'AUTRECAS'
-            data["rtype"] = 'ARTICLE'
-            data["mime"] = 'HTML'
-          end
+      if (params.key?("format"))
+        case params["format"][0]
+        when 'GNBFULL'
+          data["rtype"] = 'ARTICLE'
+          data["mime"] = 'HTML'
+        when 'AUTRECAS'
+          data["rtype"] = 'ARTICLE'
+          data["mime"] = 'HTML'
         end
       end
+
     end
 
     return data
data/lib/logstash/filters/sage.rb
@@ -1,11 +1,8 @@
 require "uri"
 
 module Sage
-  def Sage.parse (input)
-    uri = URI(URI.unescape(input))
-
-    path = uri.path
-
+  def Sage.parse (path, params)
+
 
    data = {
      "provider" => "sage"
    }
data/lib/logstash/filters/sciencedirect.rb
@@ -1,15 +1,8 @@
 require 'uri'
+require 'cgi'
 
 module ScienceDirect
-  def ScienceDirect.parse (input)
-    uri = URI(URI.unescape(input))
-
-    path = uri.path
-    params = {}
-
-    if (uri.query)
-      params = CGI::parse(uri.query)
-    end
+  def ScienceDirect.parse (path, params)
 
    data = {
      "provider" => "sciencedirect"
@@ -30,7 +23,7 @@ module ScienceDirect
       data["pii"] = (params['_hubEid'][0] || '').split('-')[2]
 
 
-      if (params.key?("_isbn")|| params.key?('isBook'))
+      if (params.key?("_isbn") || params.key?('isBook'))
         data['rtype'] = 'CHAPTERS_BUNDLE'
         data['print_identifier'] = params['_isbn'][0]
         data['title_id'] = params['_isbn'][0]
data/lib/logstash/filters/springer.rb (new file)
@@ -0,0 +1,142 @@
+require 'uri'
+require 'cgi'
+
+module Springer
+  def Springer.parse (path, params)
+
+    data = {
+      "provider" => "springer"
+    }
+
+    if ((match = /\/journal(\/volumesAndIssues)?\/([0-9]+)/.match(path)))
+      data['title_id'] = match[2]
+      data['unit_id'] = match[2]
+      data['rtype'] = 'TOC'
+      data['mime'] = 'MISC'
+
+    elsif ((match = /^\/(article|book|protocol)\/([0-9]+\.[0-9]+\/[^\/]+)(\/page\/[0-9]+)?(\/fulltext.html)?/.match(path)))
+      data['doi'] = match[2]
+      data['unit_id'] = match[2].split('/')[1] + (match[3] || '')
+
+      case (match[1])
+      when 'article'
+        data['rtype'] = 'ARTICLE'
+        data['mime'] = 'HTML'
+      when 'book'
+        data['rtype'] = 'BOOK'
+        data['mime'] = 'HTML'
+
+        if (/^\/book\/([0-9]+\.[0-9]+\/([0-9-])+)$/.match(path))
+          data['rtype'] = 'TOC'
+          data['mime'] = 'MISC'
+          data['online_identifier'] = match[2].split('/')[1]
+        end
+
+      when 'protocol'
+        data['rtype'] = 'BOOK'
+        data['mime'] = 'HTML'
+      end
+
+    elsif ((match = /^\/content\/pdf\/(10\.[0-9]+\/(.+?))(\.pdf)?$/.match(path)))
+
+      data['doi'] = match[1]
+      data['unit_id'] = match[2]
+      data['rtype'] = 'ARTICLE'
+      data['mime'] = 'PDF'
+
+      if (/^(\d-*){13}(?![\d-])/.match(match[2]))
+        data['rtype'] = 'BOOK'
+      end
+
+
+    elsif ((match = /^\/content\/([0-9]{4}-[0-9]{4})/.match(path)))
+      data['print_identifier'] = match[1]
+      data['unit_id'] = match[1]
+      data['rtype'] = 'TOC'
+      data['mime'] = 'MISC'
+
+    elsif ((match = /^\/content\/([a-zA-Z0-9]+)(\/fulltext.pdf)?/.match(path)))
+      data['unit_id'] = match[1]
+      data['rtype'] = 'ABS'
+      data['mime'] = 'MISC'
+
+    elsif ((match = /^\/chapter\/(([0-9]+\.[0-9]+)\/([^\/]*))(\/([a-z]+)\.html)?/.match(path)))
+      data['doi'] = match[1]
+      data['unit_id'] = match[3]
+      data['rtype'] = 'BOOK_SECTION'
+      data['mime'] = 'HTML'
+
+    elsif ((match = /^\/(book)?series\/([0-9]+)/.match(path)))
+      data['title_id'] = match[2]
+      data['unit_id'] = match[2]
+      data['rtype'] = 'BOOKSERIE'
+      data['mime'] = 'MISC'
+
+    elsif ((match = /^\/openurl.asp/.match(path)))
+      if (params.key?('genre') && params['genre'][0] == 'journal')
+        if (params['issn'][0])
+          data['print_identifier'] = params['issn'][0]
+          data['unit_id'] = params['issn'][0]
+        end
+
+        data['rtype'] = 'TOC'
+        data['mime'] = 'MISC'
+      end
+
+    elsif ((match = /^\/static\/pdf\/([0-9]+)\/([a-zA-Z]{3})([^\/]+)\.pdf/.match(path)))
+      if ((params.key?('ext') && params['ext'][0] == '.pdf') || params.key?('token2'))
+        data['title_id'] = match[1]
+        data['mime'] = 'PDF'
+        data['unit_id'] = URI::unescape(match[3])[1..-1]
+
+        case match[2]
+        when 'art'
+          data['unit_id'] = data['unit_id'].split('/')[1]
+          data['doi'] = URI::unescape(match[3])[1..-1]
+          data['rtype'] = 'ARTICLE'
+
+        when 'chp'
+          data['unit_id'] = data['unit_id'].split('/')[1]
+          data['doi'] = URI::unescape(match[3])[1..-1]
+          data['rtype'] = 'BOOK_SECTION'
+
+        when 'bok'
+          data['online_identifier'] = data['unit_id']
+          data['rtype'] = 'BOOK'
+
+        when 'bfm'
+          data['online_identifier'] = data['unit_id'].split('/')[0]
+          data['rtype'] = 'TOC'
+
+        else
+          data['rtype'] = 'TOC'
+        end
+      end
+
+
+    elsif ((match = /^\/(download|static)\/([a-z]+)\/(([0-9.]*)\/([^\/]*)).epub/.match(path)))
+      if (/([0-9]+)\.([0-9]+)/.match(match[4]))
+        data['doi'] = match[3]
+        data['unit_id'] = match[5] + '.epub'
+        data['print_identifier'] = match[5]
+      else
+        if (testunitid = /([\w%]*)[A-Z]{1}([0-9-]+)/.match(match[5]))
+          data['unit_id'] = testunitid[2] + '.epub'
+          data['print_identifier'] = testunitid[2]
+        end
+      end
+
+      data['rtype'] = 'BOOK'
+      data['mime'] = 'EPUB'
+    end
+
+    if (data['doi'])
+      title_id = /s([0-9]+)-/.match(data['doi'])
+      if (title_id && title_id[1])
+        data['title_id'] = '' + Integer(title_id[1], 10).to_s;
+      end
+    end
+
+    return data;
+  end
+end
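
To see what the new Springer parser emits, here is an illustrative call; the DOI and journal number are made up, but the field values follow from the regexes above (the /article/ branch plus the trailing s<digits>- title_id rule at the end of the method):

    require 'logstash/filters/springer'

    Springer.parse('/article/10.1007/s12345-016-0001-2', {})
    # => { "provider" => "springer",
    #      "doi"      => "10.1007/s12345-016-0001-2",
    #      "unit_id"  => "s12345-016-0001-2",
    #      "rtype"    => "ARTICLE",
    #      "mime"     => "HTML",
    #      "title_id" => "12345" }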
data/lib/logstash/filters/tandf.rb
@@ -3,15 +3,7 @@ require 'uri'
 require 'cgi'
 
 module TandF
-  def TandF.parse (input)
-
-    uri = URI(URI.unescape(input))
-
-    path = uri.path
-    params = {}
-    if (uri.query)
-      params = CGI::parse(uri.query)
-    end
+  def TandF.parse (path, params)
 
    data = {
      "provider" => "tandf"
data/lib/logstash/filters/wiley.rb
@@ -1,16 +1,11 @@
 require 'uri'
 
 module Wiley
-  def Wiley.parse (input)
-    uri = URI(URI.unescape(input))
-
-    path = uri.path
+  def Wiley.parse (path, params)
 
    data = {
      "provider" => "wiley"
    }
-
-    #!/usr/bin/env node
 
    if ((match = /\/journal\/(10\.[0-9]+\/(\(ISSN\)([0-9]{4}-[0-9]{3}[0-9xX])))/i.match(path)))
      data['doi'] = match[1];
@@ -27,7 +22,7 @@ module Wiley
       data['rtype'] = 'TOC';
       data['mime'] = 'MISC';
 
-        data['publication_date'] = match[4];
+      data['publication_date'] = match[4];
 
     elsif ((match = /^\/doi\/(10\.[0-9]+\/(j\.([0-9]{4}-[0-9]{3}[0-9xX])\.([0-9]{4})\.[^.]+\.[^.]+))\/abstract$/i.match(path)))
       data['doi'] = match[1];
@@ -35,8 +30,8 @@ module Wiley
       data['rtype'] = 'ABS';
       data['mime'] = 'MISC';
 
-        data['online_identifier'] = match[3];
-        data['publication_date'] = match[4];
+      data['online_identifier'] = match[3];
+      data['publication_date'] = match[4];
 
     elsif ((match = /^\/doi\/(10\.[0-9]+\/(([^.]+)\.([0-9]{4})[0-9]+))\/abstract$/i.match(path)))
       data['doi'] = match[1];
@@ -45,7 +40,7 @@ module Wiley
       data['rtype'] = 'ABS';
       data['mime'] = 'MISC';
 
-        data['publication_date'] = match[4];
+      data['publication_date'] = match[4];
 
     elsif ((match = /^\/doi\/(10\.[0-9]+\/(([^.]+)\.[0-9]+))\/full$/i.match(path)))
       data['doi'] = match[1];
@@ -116,18 +111,15 @@ module Wiley
       data['rtype'] = 'ARTICLE';
       data['mime'] = 'READCUBE';
 
-      if (uri.query)
-        params = CGI::parse(uri.query)
-
-        if (params["resource"])
+      if (params.key?("resource"))
 
-          data['doi'] = params["resource"][0];
-          data['unit_id'] = params["resource"][0].split('/')[1];
-          if ((match = /(10\.[0-9]+)\/([0-9]{4})([a-z0-9]{2})([^\/]+)$/i.match(params['resource'][0])))
-            data['title_id'] = match[3].upcase;
-          end
+        data['doi'] = params["resource"][0];
+        data['unit_id'] = params["resource"][0].split('/')[1];
+        if ((match = /(10\.[0-9]+)\/([0-9]{4})([a-z0-9]{2})([^\/]+)$/i.match(params['resource'][0])))
+          data['title_id'] = match[3].upcase;
         end
       end
+
 
 
     elsif ((match = /^\/doi\/(10\.[0-9]+\/([a-z]{1}[0-9]{8}([0-9]{2})[a-z0-9]+))\/pdf$/i.match(path)))
data/lib/logstash/helpers/url_parser.rb (new file)
@@ -0,0 +1,23 @@
+require 'uri'
+require 'cgi'
+
+module URLParser
+  def URLParser.parse (input)
+    begin
+      uri = URI(input)
+      path = URI::unescape(uri.path)
+      params = {}
+      if (uri.query)
+        params = CGI::parse(URI::unescape(uri.query))
+      end
+      return {
+        "uri" => uri,
+        "path" => path,
+        "params" => params
+      }
+    rescue
+      puts input
+      return {}
+    end
+  end
+end
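
The helper returns the three pieces the specs below consume, and swallows unparseable input. An illustrative run (the ScienceDirect URL is an example, not taken from the diff):

    require 'logstash/helpers/url_parser'

    parsed = URLParser::parse('http://www.sciencedirect.com/science/article/pii/S0024630116300073?np=y')
    parsed['path']      # => "/science/article/pii/S0024630116300073"
    parsed['params']    # => {"np"=>["y"]}
    parsed['uri'].host  # => "www.sciencedirect.com"

On input that URI() rejects, the rescue prints the offending string and returns an empty hash, so callers have to tolerate missing keys.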
data/logstash-filter-ezproxy.gemspec
@@ -1,6 +1,6 @@
 Gem::Specification.new do |s|
   s.name = 'logstash-filter-ezproxy'
-  s.version = '0.1.0'
+  s.version = '0.1.1'
   s.licenses = ['Apache-2.0']
   s.summary = 'Write a short summary, because Rubygems requires one.'
   s.authors = ['Dom Belcher']
@@ -16,6 +16,7 @@ Gem::Specification.new do |s|
   s.metadata = { "logstash_plugin" => "true", "logstash_group" => "filter" }
 
   # Gem dependencies
-  s.add_runtime_dependency "logstash-core-plugin-api", "~> 2.0"
+  #s.add_dependency "logstash-core", "~> 6.1.0"
+  s.add_runtime_dependency "logstash-core-plugin-api", ">= 1.60", "<= 2.99"
   s.add_development_dependency 'logstash-devutils'
 end
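
The loosened constraint is the range the Logstash plugin generator itself emits for filter plugins, so the gem installs against both the 1.x and 2.x plugin APIs rather than 2.x only; the commented-out logstash-core line looks like a leftover from testing against Logstash 6.1.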
data/spec/filters/dawsonera/dawsonera_spec.rb
@@ -1,11 +1,16 @@
 require 'logstash/filters/dawsonera'
 require 'csv'
+require 'logstash/helpers/url_parser'
 
 RSpec.describe DawsonEra do
   CSV.foreach('spec/filters/dawsonera/dawsonera.2014-09-03.csv', { :col_sep => ';', headers: true }) do |row|
+    parsed_url = URLParser::parse(row[4])
+    path = parsed_url['path']
+    params = parsed_url['params']
+
     name = "tests a URL " + row[4]
     it name do
-      data = DawsonEra.parse(row[4])
+      data = DawsonEra.parse(path, params)
       expect(data["unit_id"]).to eq(row[0])
       expect(data["online_identifier"]).to eq(row[1])
       expect(data["rtype"]).to eq(row[2])
data/spec/filters/ebscohost/ebscohost_spec.rb
@@ -1,11 +1,17 @@
 require 'logstash/filters/ebscohost'
+require 'logstash/helpers/url_parser'
 require 'csv'
 
 RSpec.describe Ebscohost do
   CSV.foreach('spec/filters/ebscohost/ebscohost.2014-08-21.csv', { :col_sep => ';', headers: true }) do |row|
+    parsed_url = URLParser::parse(row[11])
+    uri = parsed_url['uri']
+    path = parsed_url['path']
+    params = parsed_url['params']
+
     name = "tests a URL " + row[11]
     it name do
-      data = Ebscohost.parse(row[11])
+      data = Ebscohost.parse(path, params, uri)
       expect(data["unit_id"]).to eq(row[0])
       expect(data["rtype"]).to eq(row[1])
       expect(data["mime"]).to eq(row[2])
data/spec/filters/emerald/emerald_spec.rb
@@ -1,11 +1,16 @@
 require 'logstash/filters/emerald'
+require 'logstash/helpers/url_parser'
 require 'csv'
 
 RSpec.describe Emerald do
   CSV.foreach('spec/filters/emerald/emerald.2015-08-11.csv', { :col_sep => ';', headers: true }) do |row|
+    parsed_url = URLParser::parse(row[6])
+    path = parsed_url['path']
+    params = parsed_url['params']
+
     name = "tests a URL " + row[6]
     it name do
-      data = Emerald.parse(row[6])
+      data = Emerald.parse(path, params)
       expect(data["title_id"]).to eq(row[0])
       expect(data["doi"]).to eq(row[1])
       expect(data["unit_id"]).to eq(row[2])
data/spec/filters/ft/ft.2016-11-22.csv (new file)
@@ -0,0 +1,9 @@
+out-title_id;out-unitid;out-publication_date;out-rtype;out-mime;in-url
+;0b4a4790-6454-11e6-8310-ecf0bddad227;;ARTICLE;HTML;http://www.ft.com/cms/s/0/0b4a4790-6454-11e6-8310-ecf0bddad227.html#axzz4HgTPkTq0
+5088258522001;Market-Minute-Federal-Reserve-in-focus;;VIDEO;MISC;http://video.ft.com/5088258522001/Market-Minute-Federal-Reserve-in-focus/Editors-Choice
+2172893;juicing-the-numbers-is-ok-if-youre-in-silicon-valley-apparently;2016;ARTICLE;HTML;http://ftalphaville.ft.com/2016/08/18/2172893/juicing-the-numbers-is-ok-if-youre-in-silicon-valley-apparently/
+;;;ARTICLE;HTML;http://digital.olivesoftware.com/Olive/ODE/FTePaperUK/?simigvis=OC4zMzguODQzMzc4ODU1NTI4MDQuMTQ3MTUyMTE1ODI3My43ZGE0ZWZkZQ__*
+;;;TOOL;HTML;http://markets.ft.com/data/portfolio/dashboard
+;emerging-voices;;REPORT;HTML;http://www.ft.com/reports/emerging-voices
+;0b5de310-6b56-11e5-8171-ba1968cf791a;;REPORT;PDF;http://im.ft-static.com/content/images/0b5de310-6b56-11e5-8171-ba1968cf791a.pdf
+;global-mba-ranking-2016;2016;TOOL;HTML;http://rankings.ft.com/businessschoolrankings/mcgill-university/global-mba-ranking-2016#global-mba-ranking-2016
data/spec/filters/ft/ft_spec.rb (new file)
@@ -0,0 +1,21 @@
+require 'logstash/filters/ft'
+require 'logstash/helpers/url_parser'
+require 'csv'
+
+RSpec.describe FT do
+  CSV.foreach('spec/filters/ft/ft.2016-11-22.csv', { :col_sep => ';', headers: true }) do |row|
+    parsed_url = URLParser::parse(row[5])
+    path = parsed_url['path']
+    params = parsed_url['params']
+
+    name = "tests a URL " + row[5]
+    it name do
+      data = FT.parse(path, params)
+      expect(data["title_id"]).to eq(row[0])
+      expect(data["unit_id"]).to eq(row[1])
+      expect(data["publication_date"]).to eq(row[2])
+      expect(data["rtype"]).to eq(row[3])
+      expect(data["mime"]).to eq(row[4])
+    end
+  end
+end
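
With the gem's development dependencies installed (logstash-devutils brings in RSpec), these fixture-driven specs should run with the usual invocation, for example:

    bundle exec rspec spec/filters/ft/ft_spec.rb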