domain_parser 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 981dcebc546d3279b66c6372081c461692b11430
4
+ data.tar.gz: bb604a0ed4f4132408f4e680e36e114d9e5ab352
5
+ SHA512:
6
+ metadata.gz: 9fd996618d6de595a93db5177c0fee669130facf56540261a29aecfc2358b1e4f04bbec29da1e1b539acbcd6463619a442380d4cbb4be3e80efaa5c27b8c1346
7
+ data.tar.gz: fd2004de31afc48d9bf16144a79a1bde1962bcfcfcf790dd24182241d36a9dfa1458075d773c818ea7556075c98bf85aa599c3dab048686014bb5c07f5050d8d
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --format documentation
2
+ --color
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in domain_parser.gemspec
4
+ gemspec
data/README.md ADDED
@@ -0,0 +1,26 @@
1
+ # DomainParser
2
+
3
+ This gem parses the domain of a URL and uses an LRU hash as a cache to speed up parsing. Its code comes from scrappy.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ ```ruby
10
+ gem 'domain_parser'
11
+ ```
12
+
13
+ And then execute:
14
+
15
+ $ bundle
16
+
17
+ Or install it yourself as:
18
+
19
+ $ gem install domain_parser
20
+
21
+ ## Usage
22
+
23
+ ```ruby
24
+ parser = DomainParser.new
25
+ parser.domain('http://pdx.eater.com/venue/paiche')
26
+ ```
data/Rakefile ADDED
@@ -0,0 +1,6 @@
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+
6
+ task :default => :spec
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "domain_parser"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,3 @@
1
+ {
2
+ "cache_size": 100000
3
+ }
@@ -0,0 +1,29 @@
1
# coding: utf-8
# Gem specification for domain_parser.

# Put lib/ on the load path so the version constant can be required below.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'domain_parser/version'

Gem::Specification.new do |gem|
  # Identity
  gem.name     = 'domain_parser'
  gem.version  = DomainParser::VERSION
  gem.authors  = ['Ryan Yang']
  gem.email    = ['ryan@factual.com']

  gem.summary  = 'Gem for parsing domain of url'
  gem.homepage = 'https://github.com/Factual/data-projects/tree/develop/lib/domain_parser'

  # Package every git-tracked file except those under test/, spec/ or features/.
  tracked_files = `git ls-files -z`.split("\x0")
  gem.files = tracked_files.reject { |path| path.match(%r{^(test|spec|features)/}) }

  # Executables are whatever packaged files live under exe/.
  gem.bindir        = 'exe'
  gem.executables   = gem.files.grep(%r{^exe/}) { |path| File.basename(path) }
  gem.require_paths = ['lib']

  # Runtime dependencies
  gem.add_runtime_dependency 'public_suffix', '~> 2.0', '>= 2.0.4'
  gem.add_runtime_dependency 'addressable', '~> 2.5', '>= 2.5.0'
  gem.add_runtime_dependency 'lru_redux', '~> 1.1', '>= 1.1.0'

  # Development-only dependencies
  gem.add_development_dependency 'bundler', '~> 1.13'
  gem.add_development_dependency 'rake', '~> 10.0'
  gem.add_development_dependency 'rspec', '~> 3.0'
end
@@ -0,0 +1,3 @@
1
class DomainParser
  # Release version of the domain_parser gem.
  # Frozen so the shared constant cannot be mutated in place.
  VERSION = '0.1.0'.freeze
end
@@ -0,0 +1,58 @@
1
+ require 'json'
2
+ require 'logger'
3
+ require 'lru_redux'
4
+ require 'public_suffix'
5
+ require 'addressable/uri'
6
+
7
# Extracts the registrable (root) domain from URLs and memoizes the result
# per host in an LRU cache.
#
#   parser = DomainParser.new
#   parser.domain('http://pdx.eater.com/venue/paiche')
class DomainParser
  # Library files (relative to lib/) that are loaded together with this class.
  FILES = %w(
    domain_parser/version.rb
  ).freeze

  BASE_DIR = File.expand_path('../../', __FILE__)
  LIB_DIR = File.join(BASE_DIR, 'lib')
  FILES.each { |file| require File.join(LIB_DIR, file) }

  # Default options, read from config/defaults.json with symbolized keys.
  CONFIG = JSON.parse(File.read(File.join(BASE_DIR, 'config', 'defaults.json')), symbolize_names: true)

  LOGGER = Logger.new(STDERR)
  LOGGER.level = Logger::INFO

  # Matches a host that consists solely of a dotted-quad address. Anchored so
  # that names merely containing such a run (e.g. "1.2.3.4.example.com") are
  # still resolved to their domain.
  IPV4_HOST = /\A\d{1,3}(?:\.\d{1,3}){3}\z/

  # Matches URLs whose scheme is javascript: or mailto:. Anchored to the start
  # (after optional whitespace) so URLs that only mention these words in a
  # path or query string are not discarded.
  NON_HOST_SCHEME = /\A\s*(?:javascript|mailto):/i

  # @param args [Hash] options merged over CONFIG; :cache_size sets the
  #   maximum number of entries kept in the LRU cache.
  def initialize(args = {})
    @opts = CONFIG.merge(args)
    @cache = LruRedux::Cache.new(@opts[:cache_size])
  end

  # Returns the host component of +url+, or +url+ itself when no host can be
  # extracted.
  #
  # @param url [String] a URL, with or without a scheme
  # @return [String]
  def host(url)
    uri = begin
      Addressable::URI.parse(url)
    rescue Addressable::URI::InvalidURIError
      # An IP address with a port, like 10.20.10.127:1234, fails to parse
      # because "10.20.10.127" is read as an (invalid) scheme. Retry with an
      # explicit scheme.
      Addressable::URI.parse('http://' + url)
    end
    # A scheme-less input has no authority component; add a scheme so that
    # the host is recognized.
    uri = Addressable::URI.parse('http://' + url) if uri.scheme.nil?
    uri.host || url
  end

  # Returns the root domain of +url+.
  #
  # IPv4 hosts are returned unchanged and are not cached. When the public
  # suffix lookup fails, the problem is logged and the host is returned.
  #
  # @param url [String, nil]
  # @return [String, nil] nil when +url+ is nil or uses the javascript: or
  #   mailto: scheme
  def domain(url)
    return nil if url.nil? || url =~ NON_HOST_SCHEME

    url = url.strip
    hostname = host(url)

    # Do nothing if it's an IP.
    return hostname if hostname =~ IPV4_HOST

    cached_domain = @cache[hostname]
    return cached_domain unless cached_domain.nil?

    root = nil
    begin
      root = PublicSuffix.parse(hostname).domain
      # Hosts without a registrable domain are cached as themselves.
      @cache[hostname] = root || hostname
    rescue PublicSuffix::DomainInvalid => e
      LOGGER.warn "#{e.message} parsing root domain from host #{url.inspect}"
    rescue StandardError => e
      LOGGER.error "#{e.message} parsing root domain from host #{url.inspect}"
    end

    root || hostname
  end
end
metadata ADDED
@@ -0,0 +1,155 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: domain_parser
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Ryan Yang
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2017-01-24 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: public_suffix
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '2.0'
20
+ - - ">="
21
+ - !ruby/object:Gem::Version
22
+ version: 2.0.4
23
+ type: :runtime
24
+ prerelease: false
25
+ version_requirements: !ruby/object:Gem::Requirement
26
+ requirements:
27
+ - - "~>"
28
+ - !ruby/object:Gem::Version
29
+ version: '2.0'
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: 2.0.4
33
+ - !ruby/object:Gem::Dependency
34
+ name: addressable
35
+ requirement: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - "~>"
38
+ - !ruby/object:Gem::Version
39
+ version: '2.5'
40
+ - - ">="
41
+ - !ruby/object:Gem::Version
42
+ version: 2.5.0
43
+ type: :runtime
44
+ prerelease: false
45
+ version_requirements: !ruby/object:Gem::Requirement
46
+ requirements:
47
+ - - "~>"
48
+ - !ruby/object:Gem::Version
49
+ version: '2.5'
50
+ - - ">="
51
+ - !ruby/object:Gem::Version
52
+ version: 2.5.0
53
+ - !ruby/object:Gem::Dependency
54
+ name: lru_redux
55
+ requirement: !ruby/object:Gem::Requirement
56
+ requirements:
57
+ - - "~>"
58
+ - !ruby/object:Gem::Version
59
+ version: '1.1'
60
+ - - ">="
61
+ - !ruby/object:Gem::Version
62
+ version: 1.1.0
63
+ type: :runtime
64
+ prerelease: false
65
+ version_requirements: !ruby/object:Gem::Requirement
66
+ requirements:
67
+ - - "~>"
68
+ - !ruby/object:Gem::Version
69
+ version: '1.1'
70
+ - - ">="
71
+ - !ruby/object:Gem::Version
72
+ version: 1.1.0
73
+ - !ruby/object:Gem::Dependency
74
+ name: bundler
75
+ requirement: !ruby/object:Gem::Requirement
76
+ requirements:
77
+ - - "~>"
78
+ - !ruby/object:Gem::Version
79
+ version: '1.13'
80
+ type: :development
81
+ prerelease: false
82
+ version_requirements: !ruby/object:Gem::Requirement
83
+ requirements:
84
+ - - "~>"
85
+ - !ruby/object:Gem::Version
86
+ version: '1.13'
87
+ - !ruby/object:Gem::Dependency
88
+ name: rake
89
+ requirement: !ruby/object:Gem::Requirement
90
+ requirements:
91
+ - - "~>"
92
+ - !ruby/object:Gem::Version
93
+ version: '10.0'
94
+ type: :development
95
+ prerelease: false
96
+ version_requirements: !ruby/object:Gem::Requirement
97
+ requirements:
98
+ - - "~>"
99
+ - !ruby/object:Gem::Version
100
+ version: '10.0'
101
+ - !ruby/object:Gem::Dependency
102
+ name: rspec
103
+ requirement: !ruby/object:Gem::Requirement
104
+ requirements:
105
+ - - "~>"
106
+ - !ruby/object:Gem::Version
107
+ version: '3.0'
108
+ type: :development
109
+ prerelease: false
110
+ version_requirements: !ruby/object:Gem::Requirement
111
+ requirements:
112
+ - - "~>"
113
+ - !ruby/object:Gem::Version
114
+ version: '3.0'
115
+ description:
116
+ email:
117
+ - ryan@factual.com
118
+ executables: []
119
+ extensions: []
120
+ extra_rdoc_files: []
121
+ files:
122
+ - ".rspec"
123
+ - Gemfile
124
+ - README.md
125
+ - Rakefile
126
+ - bin/console
127
+ - bin/setup
128
+ - config/defaults.json
129
+ - domain_parser.gemspec
130
+ - lib/domain_parser.rb
131
+ - lib/domain_parser/version.rb
132
+ homepage: https://github.com/Factual/data-projects/tree/develop/lib/domain_parser
133
+ licenses: []
134
+ metadata: {}
135
+ post_install_message:
136
+ rdoc_options: []
137
+ require_paths:
138
+ - lib
139
+ required_ruby_version: !ruby/object:Gem::Requirement
140
+ requirements:
141
+ - - ">="
142
+ - !ruby/object:Gem::Version
143
+ version: '0'
144
+ required_rubygems_version: !ruby/object:Gem::Requirement
145
+ requirements:
146
+ - - ">="
147
+ - !ruby/object:Gem::Version
148
+ version: '0'
149
+ requirements: []
150
+ rubyforge_project:
151
+ rubygems_version: 2.6.6
152
+ signing_key:
153
+ specification_version: 4
154
+ summary: Gem for parsing domain of url
155
+ test_files: []