referer-parser 0.2.2 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- ZWM4OWFlNDZlN2FlOWMwYzhiZmE4YmJmN2I5MjBjYTU1MTJhNmIyZA==
4
+ YzNmNmNlYjE3ZDdlZGY5M2FjNjAzODFkZGJlNjJkZGJiMzEzOWM0OA==
5
5
  data.tar.gz: !binary |-
6
- YmYwNTJiZDVkZThkODAwOThmNGNiYTQ2NzM5MmI5NzlmNTRmMWE4MA==
6
+ MDkyODIyYTdkMjg2ZjYxOGEwNDc3YjcwODE5Zjk2N2Y3YTcxNGNmNw==
7
7
  SHA512:
8
8
  metadata.gz: !binary |-
9
- YzQxYjI1NjA3NmQzYTc1YmEzYjZkYzNhNjMwYjIyOGY4MjU0YzFhZTAzMjc3
10
- M2FjY2FmOGIyZDFhZmFjMmViNTkxODA1MDhkMjA1ZjA3MmE0NTZjMjg4OWJi
11
- OTllZTdhNTg4NzQyYWYxNjg3MDdhYTczNmU0MzZmZWU5NDFmNTc=
9
+ NDU5NjM0OGVkMjM3N2M5YTQxMWFkYzU3NTYwZDdjODdkZmJkYmIwMDZlZmUw
10
+ YzI3NzNhMjllOWU1NTk4Yzg5YjUyMDYzOWM2ZjU0OTBhNDU4YmU0Nzc4YjBk
11
+ MWI2NzU0ZjNjZjA5ZWNlZjU5M2U3OTU4MGJiMDk4Y2ViMTJiZGQ=
12
12
  data.tar.gz: !binary |-
13
- ODMzYzdmYjFlNmYyNmQzNjIxMmY0OTQ5ZmU0OTZhZWVjMGQ5YTc2MzQwMWE4
14
- ZmJjYTQ1ODcxZWM3MzFlZDE1NGQzZTFlZWVlMGQ3YzNmMjZhMzc3OGU0YmE1
15
- YzJlNmJkMzhhN2QxMzdiNjY3Yzk2ZjdjZGFkOGRiNzU5NjQyODc=
13
+ M2M2YWNhOTc4ODEwMjQxNTdjZTA4YjE5ZDZmMWZjNjJhNGFlMjA0MGRlZjEw
14
+ NGFkNTQ0MGYyOTlhNWNkYjZlYjhkYzg4NWJlYTU3ZDc1MDRmNjBlM2FkMTEz
15
+ ZWMzYTdlNWEyYmFkZWU3M2Y5NjI5YThhNDczNGZkMDZmMTk5MmY=
data/README.md CHANGED
@@ -4,8 +4,6 @@ This is the Ruby implementation of [referer-parser] [referer-parser], the librar
4
4
 
5
5
  The implementation uses the shared 'database' of known referers found in [`referers.yml`] [referers-yml].
6
6
 
7
- **Currently the Ruby library only extracts search engine referers - it needs updating with the additional functionality now found in the Java/Scala version.**
8
-
9
7
  ## Installation
10
8
 
11
9
  Add this line to your application's Gemfile:
@@ -22,20 +20,55 @@ Or install it yourself as:
22
20
 
23
21
  ## Usage
24
22
 
25
- Use referer-parser like this:
23
+ ### To include referer-parser:
26
24
 
27
25
  ```ruby
28
26
  require 'referer-parser'
27
+ ```
28
+
29
+ ### To create a parser
30
+
31
+ Parsers are created by default with the bundled set of referers, but referers can also be loaded from one or more other files or URIs, either during or after instantiation.
32
+
33
+ Creating and modifying the parser:
34
+
35
+ ```ruby
36
+ # Default parser
37
+ parser = RefererParser::Parser.new
29
38
 
30
- referer_url = 'http://www.google.com/search?q=gateway+oracle+cards+denise+linn&hl=en&client=safari'
39
+ # Custom parser with local file
40
+ parser = RefererParser::Parser.new('/path/to/other/referers.yml')
31
41
 
32
- r = RefererParser::Referer.new(referer_url)
42
+ # From a URI
43
+ parser = RefererParser::Parser.new('http://example.com/path/to/other/referers.yml')
33
44
 
34
- puts r.known? # => true
35
- puts r.referer # => 'Google'
36
- puts r.search_parameter # => 'q'
37
- puts r.search_term # => 'gateway oracle cards denise linn'
38
- puts r.uri.host # => 'www.google.com'
45
+ # Default referers, then merge in a set of custom internal domains
46
+ parser = RefererParser::Parser.new
47
+ parser.update('/path/to/internal.yml')
48
+
49
+ # Default referers, then add your own internal domain inline instead of from a file
50
+ parser = RefererParser::Parser.new
51
+ parser.add_referer('internal', 'SnowPlow', 'snowplowanalytics.com')
52
+
53
+ # Clear all of the existing referers
54
+ parser.clear!
55
+ ```
56
+
57
+ ### Using a parser
58
+
59
+ The parser returns a hash describing the referer. When a match is found, the hash includes the search term, medium, and nicely-formatted source name.
60
+ If there is no match, :known will be false.
61
+
62
+ ```ruby
63
+ parser = RefererParser::Parser.new
64
+ parser.parse('http://www.google.com/search?q=gateway+oracle+cards+denise+linn&hl=en&client=safari')
65
+ # => {
66
+ :known=>true,
67
+ :uri=>"http://www.google.com/search?q=gateway+oracle+cards+denise+linn&hl=en&client=safari",
68
+ :source=>"Google",
69
+ :medium=>"search",
70
+ :term=>"gateway oracle cards denise linn"
71
+ }
39
72
  ```
40
73
 
41
74
  ## Contributing
@@ -48,7 +81,7 @@ puts r.uri.host # => 'www.google.com'
48
81
 
49
82
  ## Copyright and license
50
83
 
51
- The referer-parser Ruby library is copyright 2012-2013 Snowplow Analytics Ltd.
84
+ The referer-parser Ruby library is copyright 2014 Inside Systems, Inc.
52
85
 
53
86
  Licensed under the [Apache License, Version 2.0] [license] (the "License");
54
87
  you may not use this software except in compliance with the License.
@@ -62,4 +95,4 @@ limitations under the License.
62
95
  [referer-parser]: https://github.com/snowplow/referer-parser
63
96
  [referers-yml]: https://github.com/snowplow/referer-parser/blob/master/referers.yml
64
97
 
65
- [license]: http://www.apache.org/licenses/LICENSE-2.0
98
+ [license]: http://www.apache.org/licenses/LICENSE-2.0
data/Rakefile CHANGED
@@ -1 +1,10 @@
1
1
  require "bundler/gem_tasks"
2
+
3
+ require 'rspec/core'
4
+ require 'rspec/core/rake_task'
5
+
6
+ RSpec::Core::RakeTask.new(:spec) do |spec|
7
+ spec.pattern = FileList['spec/**/*_spec.rb']
8
+ end
9
+
10
+ task :default => :spec
@@ -15,9 +15,7 @@
15
15
 
16
16
  require "referer-parser/version"
17
17
  require "referer-parser/errors"
18
- require "referer-parser/referers"
19
- require "referer-parser/referer"
18
+ require "referer-parser/parser"
20
19
 
21
20
  module RefererParser
22
- # Your code goes here...
23
- end
21
+ end
@@ -15,8 +15,6 @@
15
15
 
16
16
  module RefererParser
17
17
 
18
- # Errors thrown by RefererParser
19
-
20
18
  class RefererParserError < StandardError
21
19
  attr_reader :original
22
20
  def initialize(msg, original=nil);
@@ -25,12 +23,7 @@ module RefererParser
25
23
  end
26
24
  end
27
25
 
28
- class InvalidUriError < StandardError
29
- end
30
-
31
- class ReferersYamlNotFoundError < StandardError
32
- end
33
-
34
- class CorruptReferersYamlError < StandardError
35
- end
36
- end
26
+ class UnsupportedFormatError < RefererParserError; end
27
+ class InvalidUriError < RefererParserError; end
28
+ class CorruptReferersError < RefererParserError; end
29
+ end
@@ -0,0 +1,215 @@
1
+ # Copyright (c) 2014 Inside Systems, Inc All rights reserved.
2
+ #
3
+ # This program is licensed to you under the Apache License Version 2.0,
4
+ # and you may not use this file except in compliance with the Apache License Version 2.0.
5
+ # You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0.
6
+ #
7
+ # Unless required by applicable law or agreed to in writing,
8
+ # software distributed under the Apache License Version 2.0 is distributed on an
9
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the Apache License Version 2.0 for the specific language governing permissions and limitations there under.
11
+
12
+ # Author:: Kelley Reynolds (mailto:kelley@insidesystems.net)
13
+ # Copyright:: Copyright (c) 2014 Inside Systems Inc
14
+ # License:: Apache License Version 2.0
15
+
16
+ require 'uri'
17
+ require 'cgi'
18
+
19
+ module RefererParser
20
+ class Parser
21
+ DefaultFile = File.expand_path(File.join(File.dirname(__FILE__), '..', '..', 'data', 'referers.json'))
22
+
23
+ # Create a new parser from one or more filenames/uris, defaults to the bundled ../../data/referers.json
24
+ def initialize(uris=DefaultFile)
25
+ @domain_index ||= {}
26
+ @name_hash ||= {}
27
+
28
+ update(uris)
29
+ end
30
+
31
+ # Update the referer database with one or more uris
32
+ def update(uris)
33
+ [uris].flatten.each do |uri|
34
+ deserialize_referer_data(read_referer_data(uri), File.extname(uri).downcase)
35
+ end
36
+
37
+ true
38
+ end
39
+
40
+ # Clean out the database
41
+ def clear!
42
+ @domain_index, @name_hash = {}, {}
43
+
44
+ true
45
+ end
46
+
47
+ # Add a referer to the database with medium, name, domain or array of domains, and a parameter or array of parameters
48
+ # If called manually and a domain is added to an existing entry with a path, you may need to call optimize_index! afterwards.
49
+ def add_referer(medium, name, domains, parameters=nil)
50
+ # The same name can be used with multiple mediums so we make a key here
51
+ name_key = "#{name}-#{medium}"
52
+
53
+ # Update the name hash with the parameter and medium data
54
+ @name_hash[name_key] = {:source => name, :medium => medium, :parameters => [parameters].flatten }
55
+
56
+ # Update the domain to name index
57
+ [domains].flatten.each do |domain_url|
58
+ domain, *path = domain_url.split('/')
59
+ if domain =~ /\Awww\.(.*)\z/i
60
+ domain = $1
61
+ end
62
+
63
+ domain.downcase!
64
+
65
+ @domain_index[domain] ||= []
66
+ if !path.empty?
67
+ @domain_index[domain] << ['/' + path.join('/'), name_key]
68
+ else
69
+ @domain_index[domain] << ['/', name_key]
70
+ end
71
+ end
72
+ end
73
+
74
+ # Prune duplicate entries and sort with the most specific path first if there is more than one entry
75
+ # In this case, sorting by the longest string works fine
76
+ def optimize_index!
77
+ @domain_index.each do |key, val|
78
+ # Sort each path/name_key pair by the longest path
79
+ @domain_index[key].sort! { |a, b|
80
+ b[0].size <=> a[0].size
81
+ }.uniq!
82
+ end
83
+ end
84
+
85
+ # Given a string or URI, return a hash of data
86
+ def parse(obj)
87
+ url = obj.is_a?(URI) ? obj : URI.parse(obj.to_s)
88
+
89
+ if !['http', 'https'].include?(url.scheme)
90
+ raise InvalidUriError.new("Only HTTP and HTTPS schemes are supported -- #{url.scheme}")
91
+ end
92
+
93
+ data = { :known => false, :uri => url.to_s }
94
+
95
+ domain, name_key = domain_and_name_key_for(url)
96
+ if domain and name_key
97
+ referer_data = @name_hash[name_key]
98
+ data[:known] = true
99
+ data[:source] = referer_data[:source]
100
+ data[:medium] = referer_data[:medium]
101
+ data[:domain] = domain
102
+
103
+ # Parse parameters if the referer uses them
104
+ if url.query and referer_data[:parameters]
105
+ query_params = CGI.parse(url.query)
106
+ referer_data[:parameters].each do |param|
107
+ # If there is a matching parameter, get the first non-blank value
108
+ if !(values = query_params[param]).empty?
109
+ data[:term] = values.select { |v| v.strip != "" }.first
110
+ break if data[:term]
111
+ end
112
+ end
113
+ end
114
+ end
115
+
116
+ data
117
+ rescue URI::InvalidURIError
118
+ raise InvalidUriError.new("Unable to parse URI, not a URI? -- #{obj.inspect}", $!)
119
+ end
120
+
121
+ protected
122
+
123
+ # Determine the correct name_key for this host and path
124
+ def domain_and_name_key_for(uri)
125
+ # Create a proc whose `return` exits this method as soon as a matching domain/path is found
126
+ check = Proc.new do |domain|
127
+ domain.downcase!
128
+ if paths = @domain_index[domain]
129
+ paths.each do |path, name_key|
130
+ return [domain, name_key] if uri.path.include?(path)
131
+ end
132
+ end
133
+ end
134
+
135
+ # First check hosts with and without the www prefix with the path
136
+ if uri.host =~ /\Awww\.(.+)\z/i
137
+ check.call $1
138
+ else
139
+ check.call uri.host
140
+ end
141
+
142
+ # Strip leading subdomains one at a time, checking each shorter host, until only two labels are left (probably good enough)
143
+ host_arr = uri.host.split(".")
144
+ while host_arr.size > 2 do
145
+ host_arr.shift
146
+ check.call host_arr.join(".")
147
+ end
148
+
149
+ nil
150
+ end
151
+
152
+ def deserialize_referer_data(data, ext)
153
+ # Parse the loaded data with the correct parser
154
+ deserialized_data = if ['.yml', '.yaml'].include?(ext)
155
+ deserialize_yaml(data)
156
+ elsif ext == '.json'
157
+ deserialize_json(data)
158
+ else
159
+ raise UnsupportedFormatError.new("Only yaml and json file formats are currently supported -- #{@msg}")
160
+ end
161
+
162
+ begin
163
+ parse_referer_data deserialized_data
164
+ rescue
165
+ raise CorruptReferersError.new("Unable to parse data file -- #{$!.class} #{$!.to_s}", $!)
166
+ end
167
+ end
168
+
169
+ def deserialize_yaml(data)
170
+ require 'yaml'
171
+ YAML.load(data)
172
+ rescue Exception => e
173
+ raise CorruptReferersError.new("Unable to YAML file -- #{e.to_s}", e)
174
+ end
175
+
176
+ def deserialize_json(data)
177
+ require 'json'
178
+ JSON.parse(data)
179
+ rescue JSON::ParserError
180
+ raise CorruptReferersError.new("Unable to JSON file -- #{$!.to_s}", $!)
181
+ end
182
+
183
+ def read_referer_data(uri)
184
+ # Attempt to read the data from the network if applicable, or from a file on the local system
185
+ if uri =~ /\A(?:ht|f)tps?:\/\//
186
+ require 'open-uri'
187
+ begin
188
+ open(uri).read
189
+ rescue OpenURI::HTTPError
190
+ raise InvalidUriError.new("Cannot load referer data from URI #{uri} -- #{$!.to_s}", $!)
191
+ end
192
+ else
193
+ File.read(uri)
194
+ end
195
+ end
196
+
197
+ # Create an index that maps domains/paths to their name/medium and a hash that contains their metadata
198
+ # The index strips leading www in order to keep the index smaller
199
+ # Format of the domain_index:
200
+ # { domain => [[path1, name_key], [path2, name_key], ... ] }
201
+ # Format of the name_hash:
202
+ # { name_key => {:source, :medium, :parameters} }
203
+ def parse_referer_data(data)
204
+ data.each do |medium, name_hash|
205
+ name_hash.each do |name, name_data|
206
+ add_referer(medium, name, name_data['domains'], name_data['parameters'])
207
+ end
208
+ end
209
+
210
+ optimize_index!
211
+ rescue
212
+ raise CorruptReferersError.new("Unable to parse referer data", $!)
213
+ end
214
+ end
215
+ end
@@ -14,6 +14,6 @@
14
14
  # License:: Apache License Version 2.0
15
15
 
16
16
  module RefererParser
17
- NAME = "referer-parser"
18
- VERSION = "0.2.2"
17
+ NAME = "referer-parser"
18
+ VERSION = "0.3.0"
19
19
  end
@@ -19,7 +19,7 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
19
19
  require 'referer-parser/version'
20
20
 
21
21
  Gem::Specification.new do |gem|
22
- gem.authors = ["Yali Sassoon", "Martin Loy", "Alex Dean"]
22
+ gem.authors = ["Yali Sassoon", "Martin Loy", "Alex Dean", "Kelley Reynolds"]
23
23
  gem.email = ["support@snowplowanalytics.com"]
24
24
  gem.description = %q{Library for extracting marketing attribution data from referer URLs}
25
25
  gem.summary = %q{Library for extracting marketing attribution data (e.g. search terms) from referer (sic) URLs. This is used by Snowplow (http://github.com/snowplow/snowplow). Our hope is that this library (and referers.yml) will be extended by anyone interested in parsing referer URLs.}
@@ -34,4 +34,5 @@ Gem::Specification.new do |gem|
34
34
  gem.require_paths = ["lib"]
35
35
 
36
36
  gem.add_development_dependency "rspec", "~> 2.6"
37
+ gem.add_development_dependency "rake", ">= 0.9.2"
37
38
  end
@@ -0,0 +1,9 @@
1
+ {
2
+ "internal": {
3
+ "SnowPlow": {
4
+ "domains": [
5
+ "www.snowplowanalytics.com"
6
+ ]
7
+ }
8
+ }
9
+ }
@@ -0,0 +1 @@
1
+ This has the right extension but is unparsable gibberish to json:{}}}}
@@ -0,0 +1,2 @@
1
+ this:is invalid:yaml:
2
+ !!!
@@ -121,7 +121,7 @@
121
121
  },
122
122
  {
123
123
  "spec": "Ask toolbar search #2",
124
- "uri": "http://search.tb.ask.com/search/GGmain.jhtml?&st=hp&p2=^ZU^xdm458^YYA^us&n=77fda1bd&ptb=F0B68CA5-4791-4376-BFCC-5F0100329FB6&si=CMKg9-nX07oCFSjZQgodcikACQ&tpr=hpsbsug&searchfor=test",
124
+ "uri": "http://search.tb.ask.com/search/GGmain.jhtml?&st=hp&p2=%5EZU%5Exdm458%5EYYA%5Eus&n=77fda1bd&ptb=F0B68CA5-4791-4376-BFCC-5F0100329FB6&si=CMKg9-nX07oCFSjZQgodcikACQ&tpr=hpsbsug&searchfor=test",
125
125
  "medium": "search",
126
126
  "source": "Ask Toolbar",
127
127
  "term": "test",
@@ -129,7 +129,7 @@
129
129
  },
130
130
  {
131
131
  "spec": "Voila search",
132
- "uri": "http://lemoteur.ke.voila.fr/?module=voila&bhv=web_fr&kw=test",
132
+ "uri": "http://search.ke.voila.fr/?module=voila&bhv=web_fr&kw=test",
133
133
  "medium": "search",
134
134
  "source": "Voila",
135
135
  "term": "test",
@@ -219,16 +219,16 @@
219
219
  "spec": "Internal HTTP",
220
220
  "uri": "http://www.snowplowanalytics.com/about/team",
221
221
  "medium": "internal",
222
- "source": null,
222
+ "source": "SnowPlow",
223
223
  "term": null,
224
- "known": false
224
+ "known": true
225
225
  },
226
226
  {
227
227
  "spec": "Internal HTTPS",
228
228
  "uri": "https://www.snowplowanalytics.com/account/profile",
229
229
  "medium": "internal",
230
- "source": null,
230
+ "source": "SnowPlow",
231
231
  "term": null,
232
- "known": false
232
+ "known": true
233
233
  }
234
234
  ]
@@ -0,0 +1,181 @@
1
+ # Copyright (c) 2014 Inside Systems, Inc All rights reserved.
2
+ #
3
+ # This program is licensed to you under the Apache License Version 2.0,
4
+ # and you may not use this file except in compliance with the Apache License Version 2.0.
5
+ # You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0.
6
+ #
7
+ # Unless required by applicable law or agreed to in writing,
8
+ # software distributed under the Apache License Version 2.0 is distributed on an
9
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the Apache License Version 2.0 for the specific language governing permissions and limitations there under.
11
+
12
+ # Author:: Kelley Reynolds (mailto:kelley@insidesystems.net)
13
+ # Copyright:: Copyright (c) 2014 Inside Systems, Inc
14
+ # License:: Apache License Version 2.0
15
+
16
+ require 'spec_helper'
17
+
18
+ describe RefererParser::Parser do
19
+ let(:remote_file) { "https://raw.githubusercontent.com/snowplow/referer-parser/master/ruby/data/referers.json" }
20
+ let(:default_parser) { RefererParser::Parser.new }
21
+ let(:internal_parser) { RefererParser::Parser.new(fixture('internal.json')) }
22
+ let(:combined_parser) { RefererParser::Parser.new([RefererParser::Parser::DefaultFile, fixture('internal.json')]) }
23
+ let(:remote_parser) { RefererParser::Parser.new(remote_file) }
24
+ let(:domain_index) { parser.instance_variable_get(:@domain_index) }
25
+ let(:name_hash) { parser.instance_variable_get(:@name_hash) }
26
+
27
+ # This gets overridden for different parsers in subsections
28
+ let(:parser) { default_parser }
29
+
30
+ describe "exceptions" do
31
+ it "should raise UnsupportedFormatError" do
32
+ lambda { parser.update(__FILE__) }.should raise_error(RefererParser::UnsupportedFormatError)
33
+ end
34
+
35
+ it "should raise CorruptReferersError with invalid json" do
36
+ lambda { parser.update(fixture('invalid.json')) }.should raise_error(RefererParser::CorruptReferersError)
37
+ end
38
+
39
+ it "should raise CorruptReferersError with invalid yaml" do
40
+ lambda { parser.update(fixture('invalid.yml')) }.should raise_error(RefererParser::CorruptReferersError)
41
+ end
42
+
43
+ it "should raise CorruptReferersError with valid file with invalid data" do
44
+ lambda { parser.update(fixture('referer-tests.json')) }.should raise_error(RefererParser::CorruptReferersError)
45
+ end
46
+
47
+ it "should raise InvalidUriError with insane" do
48
+ lambda { parser.parse('>total gibberish<') }.should raise_error(RefererParser::InvalidUriError)
49
+ end
50
+
51
+ it "should raise InvalidUriError with non http(s)" do
52
+ lambda { parser.parse('ftp://ftp.really.com/whatever.json') }.should raise_error(RefererParser::InvalidUriError)
53
+ end
54
+ end
55
+
56
+ describe "with the default parser" do
57
+ it "should have a non-empty domain_index" do
58
+ domain_index.should_not be_empty
59
+ end
60
+
61
+ it "should have a non-empty name_hash" do
62
+ name_hash.should_not be_empty
63
+ end
64
+
65
+ it "should be clearable" do
66
+ parser.clear!
67
+ name_hash.should be_empty
68
+ domain_index.should be_empty
69
+ end
70
+
71
+ it "should be updatable" do
72
+ size = domain_index.size
73
+ parser.update(fixture('internal.json'))
74
+ domain_index.size.should > size
75
+ end
76
+ end
77
+
78
+ describe "with the internal parser" do
79
+ let(:parser) { internal_parser }
80
+
81
+ it "should have internal mediums only" do
82
+ domain_index.each_value do |(arr)|
83
+ path, name_key = arr[0], arr[1]
84
+ name_hash[name_key][:medium].should == 'internal'
85
+ end
86
+ end
87
+ end
88
+
89
+ describe "with the remote parser" do
90
+ let(:parser) { remote_parser }
91
+
92
+ # These are combined here to reduce network fetches
93
+ it "should have a non-empty domain_index and name_hash" do
94
+ domain_index.should_not be_empty
95
+ name_hash.should_not be_empty
96
+ end
97
+ end
98
+
99
+ describe "sample fixtures" do
100
+ let(:parser) { combined_parser }
101
+ # Include our internal data as well
102
+ JSON.parse(File.read(File.join(File.dirname(__FILE__), 'fixtures', 'referer-tests.json'))).each do |fixture|
103
+ it fixture['spec'] do
104
+ parsed_as_string, parsed_as_uri = nil, nil
105
+ lambda { parsed_as_string = parser.parse(fixture['uri']) }.should_not raise_error
106
+ lambda { parsed_as_uri = parser.parse(URI.parse(fixture['uri'])) }.should_not raise_error
107
+
108
+ ['source', 'term', 'known', 'medium'].each do |key|
109
+ parsed_as_uri[key.to_sym].should == fixture[key]
110
+ parsed_as_string[key.to_sym].should == fixture[key]
111
+ end
112
+ end
113
+ end
114
+ end
115
+
116
+ describe "general behavior" do
117
+ it "should return the better result when the referer contains two or more parameters" do
118
+ parsed = parser.parse("http://search.tiscali.it/?tiscalitype=web&collection=web&q=&key=hello")
119
+ parsed[:term].should == "hello"
120
+ end
121
+
122
+ it "should return the better result when the referer contains same parameters" do
123
+ parsed = parser.parse("http://search.tiscali.it/?tiscalitype=web&collection=web&key=&key=hello")
124
+ parsed[:term].should == "hello"
125
+ end
126
+
127
+ it "should return the normalized domain" do
128
+ parsed = parser.parse("http://it.images.search.YAHOO.COM/images/view;_ylt=A0PDodgQmGBQpn4AWQgdDQx.;_ylu=X3oDMTBlMTQ4cGxyBHNlYwNzcgRzbGsDaW1n?back=http%3A%2F%2Fit.images.search.yahoo.com%2Fsearch%2Fimages%3Fp%3DEarth%2BMagic%2BOracle%2BCards%26fr%3Dmcafee%26fr2%3Dpiv-web%26tab%3Dorganic%26ri%3D5&w=1064&h=1551&imgurl=mdm.pbzstatic.com%2Foracles%2Fearth-magic-oracle-cards%2Fcard-1.png&rurl=http%3A%2F%2Fwww.psychicbazaar.com%2Foracles%2F143-earth-magic-oracle-cards.html&size=2.8+KB&name=Earth+Magic+Oracle+Cards+-+Psychic+Bazaar&p=Earth+Magic+Oracle+Cards&oid=f0a5ad5c4211efe1c07515f56cf5a78e&fr2=piv-web&fr=mcafee&tt=Earth%2BMagic%2BOracle%2BCards%2B-%2BPsychic%2BBazaar&b=0&ni=90&no=5&ts=&tab=organic&sigr=126n355ib&sigb=13hbudmkc&sigi=11ta8f0gd&.crumb=IZBOU1c0UHU")
129
+ parsed[:domain].should == "images.search.yahoo.com"
130
+ end
131
+ end
132
+
133
+ describe "optimize_index" do
134
+ let(:domains) { ['fnord.com', 'fnord.com', 'fnord.com/path'] }
135
+
136
+ before do
137
+ parser.add_referer('internal', 'Fnord', domains)
138
+ end
139
+
140
+ it "should have out of order and duplicate domains before optimization" do
141
+ domain_index['fnord.com'].transpose.first.should == ['/', '/', '/path']
142
+ end
143
+
144
+ it "should have out of order domains before optimization" do
145
+ parser.optimize_index!
146
+ domain_index['fnord.com'].transpose.first.should == ['/path', '/']
147
+ end
148
+ end
149
+
150
+ describe "add_referer" do
151
+ it "should add a referer to the domain_index" do
152
+ domain_index['fnord.com'].should be_nil
153
+ parser.add_referer('internal', 'Fnord', 'fnord.com')
154
+ domain_index['fnord.com'].should_not be_nil
155
+ end
156
+
157
+ it "should add a referer with multiple domains to the domain_index" do
158
+ domain_index['fnord.com'].should be_nil
159
+ domain_index['boo.com'].should be_nil
160
+ parser.add_referer('internal', 'Fnord', ['fnord.com', 'boo.com'])
161
+ domain_index['fnord.com'].should_not be_nil
162
+ domain_index['boo.com'].should_not be_nil
163
+ end
164
+
165
+ it "should add a referer to the name_hash" do
166
+ name_hash['fnord.com-internal'].should be_nil
167
+ parser.add_referer('internal', 'Fnord', 'fnord.com')
168
+ name_hash['Fnord-internal'].should_not be_nil
169
+ end
170
+
171
+ it "should add parameters to the name_hash" do
172
+ parser.add_referer('internal', 'Fnord', 'fnord.com', ['Q', 'q'])
173
+ name_hash['Fnord-internal'][:parameters].should == ['Q', 'q']
174
+ end
175
+
176
+ it "should add a single parameter to the name_hash" do
177
+ parser.add_referer('internal', 'Fnord', 'fnord.com', 'q')
178
+ name_hash['Fnord-internal'][:parameters].should == ['q']
179
+ end
180
+ end
181
+ end
@@ -0,0 +1,35 @@
1
+ # Copyright (c) 2014 Inside Systems, Inc All rights reserved.
2
+ #
3
+ # This program is licensed to you under the Apache License Version 2.0,
4
+ # and you may not use this file except in compliance with the Apache License Version 2.0.
5
+ # You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0.
6
+ #
7
+ # Unless required by applicable law or agreed to in writing,
8
+ # software distributed under the Apache License Version 2.0 is distributed on an
9
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the Apache License Version 2.0 for the specific language governing permissions and limitations there under.
11
+
12
+ # Author:: Kelley Reynolds (mailto:kelley@insidesystems.net)
13
+ # Copyright:: Copyright (c) 2014 Inside Systems, Inc
14
+ # License:: Apache License Version 2.0
15
+
16
+
17
+ require 'rubygems'
18
+ require 'bundler'
19
+ Bundler.setup(:default, :test)
20
+
21
+ require 'yaml'
22
+ require 'rspec'
23
+ require 'referer-parser'
24
+ require 'uri'
25
+ require 'json'
26
+
27
+ module Helpers
28
+ def fixture(filename)
29
+ File.join(File.dirname(__FILE__), 'fixtures', filename)
30
+ end
31
+ end
32
+
33
+ RSpec.configure do |config|
34
+ config.include Helpers
35
+ end
metadata CHANGED
@@ -1,16 +1,17 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: referer-parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.2
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yali Sassoon
8
8
  - Martin Loy
9
9
  - Alex Dean
10
+ - Kelley Reynolds
10
11
  autorequire:
11
12
  bindir: bin
12
13
  cert_chain: []
13
- date: 2014-06-26 00:00:00.000000000 Z
14
+ date: 2014-09-03 00:00:00.000000000 Z
14
15
  dependencies:
15
16
  - !ruby/object:Gem::Dependency
16
17
  name: rspec
@@ -26,6 +27,20 @@ dependencies:
26
27
  - - ~>
27
28
  - !ruby/object:Gem::Version
28
29
  version: '2.6'
30
+ - !ruby/object:Gem::Dependency
31
+ name: rake
32
+ requirement: !ruby/object:Gem::Requirement
33
+ requirements:
34
+ - - ! '>='
35
+ - !ruby/object:Gem::Version
36
+ version: 0.9.2
37
+ type: :development
38
+ prerelease: false
39
+ version_requirements: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - ! '>='
42
+ - !ruby/object:Gem::Version
43
+ version: 0.9.2
29
44
  description: Library for extracting marketing attribution data from referer URLs
30
45
  email:
31
46
  - support@snowplowanalytics.com
@@ -42,12 +57,15 @@ files:
42
57
  - data/referers.yml
43
58
  - lib/referer-parser.rb
44
59
  - lib/referer-parser/errors.rb
45
- - lib/referer-parser/referer.rb
46
- - lib/referer-parser/referers.rb
60
+ - lib/referer-parser/parser.rb
47
61
  - lib/referer-parser/version.rb
48
62
  - referer-parser.gemspec
49
- - spec/referer-spec.rb
50
- - spec/referer-tests.json
63
+ - spec/fixtures/internal.json
64
+ - spec/fixtures/invalid.json
65
+ - spec/fixtures/invalid.yml
66
+ - spec/fixtures/referer-tests.json
67
+ - spec/parser_spec.rb
68
+ - spec/spec_helper.rb
51
69
  homepage: http://github.com/snowplow/referer-parser
52
70
  licenses: []
53
71
  metadata: {}
@@ -67,7 +85,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
67
85
  version: '0'
68
86
  requirements: []
69
87
  rubyforge_project:
70
- rubygems_version: 2.2.2
88
+ rubygems_version: 2.1.11
71
89
  signing_key:
72
90
  specification_version: 4
73
91
  summary: Library for extracting marketing attribution data (e.g. search terms) from
@@ -75,5 +93,9 @@ summary: Library for extracting marketing attribution data (e.g. search terms) f
75
93
  Our hope is that this library (and referers.yml) will be extended by anyone interested
76
94
  in parsing referer URLs.
77
95
  test_files:
78
- - spec/referer-spec.rb
79
- - spec/referer-tests.json
96
+ - spec/fixtures/internal.json
97
+ - spec/fixtures/invalid.json
98
+ - spec/fixtures/invalid.yml
99
+ - spec/fixtures/referer-tests.json
100
+ - spec/parser_spec.rb
101
+ - spec/spec_helper.rb
@@ -1,118 +0,0 @@
1
- # Copyright (c) 2012-2013 Snowplow Analytics Ltd. All rights reserved.
2
- #
3
- # This program is licensed to you under the Apache License Version 2.0,
4
- # and you may not use this file except in compliance with the Apache License Version 2.0.
5
- # You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0.
6
- #
7
- # Unless required by applicable law or agreed to in writing,
8
- # software distributed under the Apache License Version 2.0 is distributed on an
9
- # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
- # See the Apache License Version 2.0 for the specific language governing permissions and limitations there under.
11
-
12
- # Author:: Yali Sassoon (mailto:support@snowplowanalytics.com)
13
- # Copyright:: Copyright (c) 2012-2013 Snowplow Analytics Ltd
14
- # License:: Apache License Version 2.0
15
-
16
- require 'uri'
17
- require 'cgi'
18
-
19
- module RefererParser
20
- class Referer
21
-
22
- attr_reader :uri,
23
- :known,
24
- :referer,
25
- :search_parameter,
26
- :search_term
27
-
28
- # So can be interrogated with .known? too.
29
- alias_method :known?, :known
30
-
31
- def parse(referer_url)
32
- @uri = Referer::parse_uri(referer_url)
33
-
34
- referer = Referers::get_referer(@uri)
35
- unless referer.nil?
36
- @known = true
37
- @referer = referer['name']
38
- @search_parameter, @search_term = Referer::extract_search(@uri, referer['parameters'])
39
- else
40
- @known = false
41
- @referer, @search_parameter, @search_term = nil # Being explicit
42
- end
43
- end
44
-
45
- private # -------------------------------------------------------------
46
-
47
- # Static method to turn a `raw_url`
48
- # into a URI, checking that it's
49
- # a HTTP(S) URI. Supports raw
50
- # string and existing URI
51
- def self.parse_uri(raw_url)
52
-
53
- uri = if raw_url.is_a? String
54
- begin
55
- URI.parse(raw_url)
56
- rescue => error
57
- raise InvalidUriError, error.message
58
- end
59
- elsif raw_url.is_a? URI
60
- raw_url
61
- else
62
- raise InvalidUriError, "'#{raw_url}' must be a String or URI"
63
- end
64
-
65
- unless %w( http https ).include?(uri.scheme)
66
- raise InvalidUriError, "'#{raw_url}' is not an http(s) protocol URI"
67
- end
68
- uri
69
- end
70
-
71
- # Static method to get the keywords from a `uri`,
72
- # where keywords are stored against one of the
73
- # `possible_parameters` in the querystring.
74
- # Returns a 'tuple' of the parameter found plus
75
- # the keywords.
76
- def self.extract_search(uri, possible_parameters)
77
- param = nil
78
-
79
- # Only get keywords if there's a query string to extract them from...
80
- if uri.query
81
- parameters = CGI.parse(uri.query)
82
-
83
- # Try each possible keyword parameter with the querystring until one returns a result
84
- possible_parameters.each do | pp |
85
- if parameters.has_key?(pp)
86
- param = pp
87
- parameters[pp].each do |result|
88
- unless result == ""
89
- return [pp, result] # return first value not eql ""
90
- end
91
- end
92
- end
93
- end
94
- end
95
-
96
- return [param, []] # No parameter or keywords to return
97
- end
98
-
99
- # Constructor. Takes the `referer_url`
100
- # to extract the referer from (can be
101
- # a String or URI)
102
- #
103
- # Optionaly it takes the `referer_file` param
104
- # to use instead of the bundle referers.yml
105
- # (must be a yaml file)
106
- def initialize(referer_url, referer_file = nil)
107
-
108
- if referer_file.nil?
109
- Referers::load_referers_from_yaml(Referers::get_yaml_file())
110
- else
111
- Referers::load_referers_from_yaml(Referers::get_yaml_file(referer_file))
112
- end
113
-
114
- parse(referer_url)
115
-
116
- end
117
- end
118
- end
@@ -1,92 +0,0 @@
1
- # Copyright (c) 2012-2013 Snowplow Analytics Ltd. All rights reserved.
2
- #
3
- # This program is licensed to you under the Apache License Version 2.0,
4
- # and you may not use this file except in compliance with the Apache License Version 2.0.
5
- # You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0.
6
- #
7
- # Unless required by applicable law or agreed to in writing,
8
- # software distributed under the Apache License Version 2.0 is distributed on an
9
- # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
- # See the Apache License Version 2.0 for the specific language governing permissions and limitations there under.
11
-
12
- # Author:: Yali Sassoon (mailto:support@snowplowanalytics.com)
13
- # Copyright:: Copyright (c) 2012-2013 Snowplow Analytics Ltd
14
- # License:: Apache License Version 2.0
15
-
16
- require 'yaml'
17
-
18
- # This module processes the referers.yml file and
19
- # uses it to create a global hash that is used to
20
- # lookup URLs to see if they are known referers
21
- # (e.g. search engines)
22
- module RefererParser
23
- module Referers
24
-
25
- # Returns the referer indicated by
26
- # the given `uri`
27
- def self.get_referer(uri)
28
- # Check if domain+path matches (e.g. google.co.uk/products)
29
- referer = @referers[uri.host + uri.path]
30
- if referer.nil?
31
- # Check if domain only matches (e.g. google.co.uk)
32
- referer = @referers[uri.host]
33
- end
34
- referer
35
- end
36
-
37
- private # -------------------------------------------------------------
38
-
39
- # Returns the path to the YAML
40
- # file of referers
41
- def self.get_yaml_file(referer_file = nil)
42
- if referer_file.nil?
43
- File.join(File.dirname(__FILE__), '..', '..', 'data', 'referers.yml')
44
- else
45
- referer_file
46
- end
47
- end
48
-
49
- # Initializes a hash of referers
50
- # from the supplied YAML file
51
- def self.load_referers_from_yaml(yaml_file)
52
- return if @loaded_file == yaml_file
53
- unless File.exist?(yaml_file) and File.file?(yaml_file)
54
- raise ReferersYamlNotFoundError, "Could not find referers YAML file at '#{yaml_file}'"
55
- end
56
-
57
- # Load referer data stored in YAML file
58
- begin
59
- yaml = YAML.load_file(yaml_file)['search'] # TODO: fix this when we support the other types
60
- rescue error
61
- raise CorruptReferersYamlError.new("Could not parse referers YAML file '#{yaml_file}'", error)
62
- end
63
- @referers = load_referers(yaml)
64
- @loaded_file = yaml_file
65
- end
66
-
67
- # Validate and expand the `raw_referers`
68
- # array, building a hash of referers as
69
- # we go
70
- def self.load_referers(raw_referers)
71
-
72
- # Validate the YAML file, building the lookup
73
- # hash of referer domains as we go
74
- referers = Hash.new
75
- raw_referers.each { | referer, data |
76
- if data['parameters'].nil?
77
- raise CorruptReferersYamlError, "No parameters found for referer '#{referer}'"
78
- end
79
- if data['domains'].nil?
80
- raise CorruptReferersYamlError, "No domains found for referer '#{referer}'"
81
- end
82
-
83
- data['domains'].each do | domain |
84
- domain_pair = { domain => { "name" => referer,
85
- "parameters" => data['parameters']}}
86
- referers.merge!(domain_pair)
87
- end
88
- }
89
- return referers
90
- end
91
- end
92
- end
@@ -1,92 +0,0 @@
1
- # Copyright (c) 2012-2013 Snowplow Analytics Ltd. All rights reserved.
2
- #
3
- # This program is licensed to you under the Apache License Version 2.0,
4
- # and you may not use this file except in compliance with the Apache License Version 2.0.
5
- # You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0.
6
- #
7
- # Unless required by applicable law or agreed to in writing,
8
- # software distributed under the Apache License Version 2.0 is distributed on an
9
- # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
- # See the Apache License Version 2.0 for the specific language governing permissions and limitations there under.
11
-
12
- # Author:: Yali Sassoon (mailto:support@snowplowanalytics.com)
13
- # Copyright:: Copyright (c) 2012-2013 Snowplow Analytics Ltd
14
- # License:: Apache License Version 2.0
15
-
16
- require 'referer-parser'
17
- require 'uri'
18
-
19
- describe RefererParser::Referer do
20
-
21
- GOOGLE_COM_REFERER = 'http://www.google.com/search?q=gateway+oracle+cards+denise+linn&hl=en&client=safari&tbo=d&biw=768&bih=900&source=lnms&tbm=isch&ei=t9fTT_TFEYb28gTtg9HZAw&sa=X&oi=mode_link&ct=mode&cd=2&sqi=2&ved=0CEUQ_AUoAQ'
22
- GOOGLE_CO_UK_REFERER = 'http://www.google.co.uk/search?hl=en&client=safari&q=psychic+bazaar&oq=psychic+bazaa&aq=0&aqi=g1&aql=&gs_l=mobile-gws-serp.1.0.0.61498.64599.0.66559.12.9.1.1.2.2.2407.10525.6-2j0j1j3.6.0...0.0.DiYO_7K_ndg&mvs=0'
23
- FACEBOOK_COM_REFERER = 'http://www.facebook.com/l.php?u=http%3A%2F%2Fpsy.bz%2FLtPadV&h=MAQHYFyRRAQFzmokHhn3w4LGWVzjs7YwZGejw7Up5TqNHIw'
24
- TRUNCATED_REFERER = 'http://googleads.g.doubleclick.net/pagead/ads?client=ca-pub-9108147844898389&output=html&h=60&slotname=1720218904&w=468&lmt=1368485108&flash=11.7.700.169&url=http%3A%2F%2Fwww.bsaving.com%2Fprintable-online-target-coupons%3Futm_source%3Dbsaving_new_Email%2'
25
-
26
- it "Should be initializable with an external referers.yml" do
27
- external_referer = File.join(File.dirname(__FILE__), '..', 'data', 'referers.yml') # Using the bundled referers.yml in fact
28
- uri = URI.parse(GOOGLE_COM_REFERER)
29
- r = RefererParser::Referer.new(uri, external_referer)
30
- r.referer.should eql "Google"
31
- end
32
-
33
- it "Should be initializable without an external referers.yml" do
34
- uri = URI.parse(GOOGLE_COM_REFERER)
35
- r = RefererParser::Referer.new(uri)
36
- r.referer.should eql "Google"
37
- end
38
-
39
- it "Should correctly parse a google.com referer URL" do
40
- r = RefererParser::Referer.new(GOOGLE_COM_REFERER)
41
- r.known?.should eql true
42
- r.referer.should eql "Google"
43
- r.search_parameter.should eql "q"
44
- r.search_term.should eql "gateway oracle cards denise linn"
45
- r.uri.host.should eql "www.google.com"
46
- end
47
-
48
- it "Should correctly extract a google.co.uk search term" do
49
- r = RefererParser::Referer.new(GOOGLE_CO_UK_REFERER)
50
- r.search_term.should eql "psychic bazaar"
51
- end
52
-
53
- it "Should not identify Facebook as a known referer" do
54
- r = RefererParser::Referer.new(FACEBOOK_COM_REFERER)
55
- r.known?.should eql false
56
- end
57
-
58
- it "Should be initializable with an existing URI object" do
59
- uri = URI.parse(GOOGLE_COM_REFERER)
60
- r = RefererParser::Referer.new(uri)
61
- r.referer.should eql "Google"
62
- end
63
-
64
- it "Should be possible to re-use a Referer object" do
65
- r = RefererParser::Referer.new(GOOGLE_CO_UK_REFERER)
66
- r.search_term.should eql "psychic bazaar"
67
- r.parse(GOOGLE_COM_REFERER)
68
- r.search_term.should eql "gateway oracle cards denise linn"
69
- r.uri.host.should eql "www.google.com"
70
- end
71
-
72
- it "Should return the better result when the referer contains two or more parameters" do
73
- referer_contains_two_params = "http://search.tiscali.it/?tiscalitype=web&collection=web&q=&key=hello"
74
- r = RefererParser::Referer.new(referer_contains_two_params)
75
- r.search_term.should eql "hello"
76
- r.search_parameter.should eql "key"
77
- end
78
-
79
- it "Should return the better result when the referer contains same parameters" do
80
- referer_contains_two_params = "http://search.tiscali.it/?tiscalitype=web&collection=web&key=&key=hello"
81
- r = RefererParser::Referer.new(referer_contains_two_params)
82
- r.search_term.should eql "hello"
83
- r.search_parameter.should eql "key"
84
- end
85
-
86
- it "should raise InvalidUriError on a truncated Uri" do
87
- expect{
88
- r = RefererParser::Referer.new(TRUNCATED_REFERER)
89
- }.to raise_error(RefererParser::InvalidUriError)
90
- end
91
-
92
- end