domain_parser 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 981dcebc546d3279b66c6372081c461692b11430
4
+ data.tar.gz: bb604a0ed4f4132408f4e680e36e114d9e5ab352
5
+ SHA512:
6
+ metadata.gz: 9fd996618d6de595a93db5177c0fee669130facf56540261a29aecfc2358b1e4f04bbec29da1e1b539acbcd6463619a442380d4cbb4be3e80efaa5c27b8c1346
7
+ data.tar.gz: fd2004de31afc48d9bf16144a79a1bde1962bcfcfcf790dd24182241d36a9dfa1458075d773c818ea7556075c98bf85aa599c3dab048686014bb5c07f5050d8d
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --format documentation
2
+ --color
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in domain_parser.gemspec
4
+ gemspec
data/README.md ADDED
@@ -0,0 +1,26 @@
1
+ # DomainParser
2
+
3
+ This gem parses the domain of a URL and uses an LRU hash as a cache to speed up parsing. Its code comes from scrappy.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ ```ruby
10
+ gem 'domain_parser'
11
+ ```
12
+
13
+ And then execute:
14
+
15
+ $ bundle
16
+
17
+ Or install it yourself as:
18
+
19
+ $ gem install domain_parser
20
+
21
+ ## Usage
22
+
23
+ ```ruby
24
+ parser = DomainParser.new
25
+ parser.domain('http://pdx.eater.com/venue/paiche') # => "eater.com"
26
+ ```
data/Rakefile ADDED
@@ -0,0 +1,6 @@
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+
6
+ task :default => :spec
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "domain_parser"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,3 @@
1
+ {
2
+ "cache_size": 100000
3
+ }
@@ -0,0 +1,29 @@
1
# coding: utf-8
# Gem specification for domain_parser.
# The gem's own lib directory is put on the load path first so the version
# constant can be required before the gem is installed.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'domain_parser/version'

Gem::Specification.new do |gem|
  gem.name     = 'domain_parser'
  gem.version  = DomainParser::VERSION
  gem.authors  = ['Ryan Yang']
  gem.email    = ['ryan@factual.com']

  gem.summary  = 'Gem for parsing domain of url'
  gem.homepage = 'https://github.com/Factual/data-projects/tree/develop/lib/domain_parser'

  # Package every git-tracked file except those under test/, spec/ or features/.
  tracked_files = `git ls-files -z`.split("\x0")
  gem.files = tracked_files.reject { |path| path.match(%r{^(test|spec|features)/}) }

  # Executables are whatever tracked files live under exe/.
  gem.bindir        = 'exe'
  gem.executables   = gem.files.grep(%r{^exe/}) { |path| File.basename(path) }
  gem.require_paths = ['lib']

  # Runtime dependencies.
  gem.add_runtime_dependency 'public_suffix', '~> 2.0', '>= 2.0.4'
  gem.add_runtime_dependency 'addressable', '~> 2.5', '>= 2.5.0'
  gem.add_runtime_dependency 'lru_redux', '~> 1.1', '>= 1.1.0'

  # Development-only dependencies.
  gem.add_development_dependency 'bundler', '~> 1.13'
  gem.add_development_dependency 'rake', '~> 10.0'
  gem.add_development_dependency 'rspec', '~> 3.0'
end
@@ -0,0 +1,3 @@
1
# Reopens DomainParser solely to define the gem's release version.
class DomainParser
  # Version string of this gem. Frozen so the constant cannot be mutated in
  # place by code that reads it.
  VERSION = '0.1.0'.freeze
end
@@ -0,0 +1,58 @@
1
+ require 'json'
2
+ require 'logger'
3
+ require 'lru_redux'
4
+ require 'public_suffix'
5
+ require 'addressable/uri'
6
+
7
# Extracts the host and the registrable ("root") domain from URL strings.
# Successfully parsed domains are memoised per host in an LruRedux cache.
#
# @example
#   parser = DomainParser.new
#   parser.domain('http://pdx.eater.com/venue/paiche')
class DomainParser
  # Library files, relative to LIB_DIR, that are required when this class loads.
  FILES = %w(
    domain_parser/version.rb
  )

  BASE_DIR = File.expand_path('../../', __FILE__)
  LIB_DIR = File.join(BASE_DIR, 'lib')
  FILES.each { |file| require File.join(LIB_DIR, file) }

  # Default options, read once at load time from config/defaults.json
  # (keys are symbolized).
  CONFIG = JSON.parse(File.read(File.join(BASE_DIR, 'config', 'defaults.json')), symbolize_names: true)

  LOGGER = Logger.new(STDERR)
  LOGGER.level = Logger::INFO

  # Matches a host that is, in its entirety, a dotted-quad IPv4 literal.
  # Anchored with \A and \z so that a hostname which merely contains four
  # dotted number groups (e.g. "192.168.1.1.nip.io") is not mistaken for an IP.
  IPV4_HOST = /\A\d{1,3}(?:\.\d{1,3}){3}\z/

  # @param args [Hash] option overrides merged over CONFIG; the resulting
  #   :cache_size is handed to LruRedux::Cache.new
  def initialize(args = {})
    @opts = CONFIG.merge(args)
    @cache = LruRedux::Cache.new(@opts[:cache_size])
  end

  # Returns the host component of +url+.
  #
  # @param url [String, nil] absolute URL, scheme-less URL or bare host[:port]
  # @return [String, nil] the parsed host; +url+ itself when no host could be
  #   extracted; nil when +url+ is nil
  # @raise [Addressable::URI::InvalidURIError] when +url+ cannot be parsed even
  #   after an "http://" prefix is added
  def host(url)
    return nil if url.nil?

    uri = parse_uri(url)
    # Scheme-less input is re-parsed with an explicit scheme so that the
    # leading segment is read as the host.
    uri = Addressable::URI.parse('http://' + url) if uri.scheme.nil?
    uri.host || url
  end

  # Returns the registrable domain of +url+.
  #
  # @param url [String, nil] URL to inspect
  # @return [String, nil] the domain reported by PublicSuffix; the host itself
  #   when it is an IPv4 literal or PublicSuffix cannot parse it; nil when
  #   +url+ is nil or contains "javascript:" / "mailto:"
  def domain(url)
    return nil if url.nil? || url =~ /javascript:|mailto:/i

    url = url.strip
    hostname = host(url)

    # An IP address has no registrable domain: return it as-is, uncached.
    return hostname if hostname =~ IPV4_HOST

    cached_domain = @cache[hostname]
    return cached_domain unless cached_domain.nil?

    begin
      root_domain = PublicSuffix.parse(hostname).domain
      @cache[hostname] = root_domain || hostname
    rescue PublicSuffix::DomainInvalid => e
      LOGGER.warn "#{e.message} parsing root domain from host #{url.inspect}"
    rescue StandardError => e
      # Best effort: any other failure is logged and the host is returned.
      LOGGER.error "#{e.message} parsing root domain from host #{url.inspect}"
    end

    # root_domain is nil when parsing raised or PublicSuffix returned no domain.
    root_domain || hostname
  end

  private

  # Parses +url+, retrying with an "http://" prefix when the first attempt is
  # rejected. This covers an IP address with a port, like 10.20.10.127:1234,
  # which fails on its own because "10.20.10.127" is read as an invalid scheme.
  def parse_uri(url)
    Addressable::URI.parse(url)
  rescue Addressable::URI::InvalidURIError
    Addressable::URI.parse('http://' + url)
  end
end
metadata ADDED
@@ -0,0 +1,155 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: domain_parser
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Ryan Yang
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2017-01-24 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: public_suffix
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '2.0'
20
+ - - ">="
21
+ - !ruby/object:Gem::Version
22
+ version: 2.0.4
23
+ type: :runtime
24
+ prerelease: false
25
+ version_requirements: !ruby/object:Gem::Requirement
26
+ requirements:
27
+ - - "~>"
28
+ - !ruby/object:Gem::Version
29
+ version: '2.0'
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: 2.0.4
33
+ - !ruby/object:Gem::Dependency
34
+ name: addressable
35
+ requirement: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - "~>"
38
+ - !ruby/object:Gem::Version
39
+ version: '2.5'
40
+ - - ">="
41
+ - !ruby/object:Gem::Version
42
+ version: 2.5.0
43
+ type: :runtime
44
+ prerelease: false
45
+ version_requirements: !ruby/object:Gem::Requirement
46
+ requirements:
47
+ - - "~>"
48
+ - !ruby/object:Gem::Version
49
+ version: '2.5'
50
+ - - ">="
51
+ - !ruby/object:Gem::Version
52
+ version: 2.5.0
53
+ - !ruby/object:Gem::Dependency
54
+ name: lru_redux
55
+ requirement: !ruby/object:Gem::Requirement
56
+ requirements:
57
+ - - "~>"
58
+ - !ruby/object:Gem::Version
59
+ version: '1.1'
60
+ - - ">="
61
+ - !ruby/object:Gem::Version
62
+ version: 1.1.0
63
+ type: :runtime
64
+ prerelease: false
65
+ version_requirements: !ruby/object:Gem::Requirement
66
+ requirements:
67
+ - - "~>"
68
+ - !ruby/object:Gem::Version
69
+ version: '1.1'
70
+ - - ">="
71
+ - !ruby/object:Gem::Version
72
+ version: 1.1.0
73
+ - !ruby/object:Gem::Dependency
74
+ name: bundler
75
+ requirement: !ruby/object:Gem::Requirement
76
+ requirements:
77
+ - - "~>"
78
+ - !ruby/object:Gem::Version
79
+ version: '1.13'
80
+ type: :development
81
+ prerelease: false
82
+ version_requirements: !ruby/object:Gem::Requirement
83
+ requirements:
84
+ - - "~>"
85
+ - !ruby/object:Gem::Version
86
+ version: '1.13'
87
+ - !ruby/object:Gem::Dependency
88
+ name: rake
89
+ requirement: !ruby/object:Gem::Requirement
90
+ requirements:
91
+ - - "~>"
92
+ - !ruby/object:Gem::Version
93
+ version: '10.0'
94
+ type: :development
95
+ prerelease: false
96
+ version_requirements: !ruby/object:Gem::Requirement
97
+ requirements:
98
+ - - "~>"
99
+ - !ruby/object:Gem::Version
100
+ version: '10.0'
101
+ - !ruby/object:Gem::Dependency
102
+ name: rspec
103
+ requirement: !ruby/object:Gem::Requirement
104
+ requirements:
105
+ - - "~>"
106
+ - !ruby/object:Gem::Version
107
+ version: '3.0'
108
+ type: :development
109
+ prerelease: false
110
+ version_requirements: !ruby/object:Gem::Requirement
111
+ requirements:
112
+ - - "~>"
113
+ - !ruby/object:Gem::Version
114
+ version: '3.0'
115
+ description:
116
+ email:
117
+ - ryan@factual.com
118
+ executables: []
119
+ extensions: []
120
+ extra_rdoc_files: []
121
+ files:
122
+ - ".rspec"
123
+ - Gemfile
124
+ - README.md
125
+ - Rakefile
126
+ - bin/console
127
+ - bin/setup
128
+ - config/defaults.json
129
+ - domain_parser.gemspec
130
+ - lib/domain_parser.rb
131
+ - lib/domain_parser/version.rb
132
+ homepage: https://github.com/Factual/data-projects/tree/develop/lib/domain_parser
133
+ licenses: []
134
+ metadata: {}
135
+ post_install_message:
136
+ rdoc_options: []
137
+ require_paths:
138
+ - lib
139
+ required_ruby_version: !ruby/object:Gem::Requirement
140
+ requirements:
141
+ - - ">="
142
+ - !ruby/object:Gem::Version
143
+ version: '0'
144
+ required_rubygems_version: !ruby/object:Gem::Requirement
145
+ requirements:
146
+ - - ">="
147
+ - !ruby/object:Gem::Version
148
+ version: '0'
149
+ requirements: []
150
+ rubyforge_project:
151
+ rubygems_version: 2.6.6
152
+ signing_key:
153
+ specification_version: 4
154
+ summary: Gem for parsing domain of url
155
+ test_files: []