fluffix 0.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.document +5 -0
- data/.rspec +1 -0
- data/Gemfile +17 -0
- data/Gemfile.lock +92 -0
- data/LICENSE.txt +20 -0
- data/README.rdoc +22 -0
- data/Rakefile +48 -0
- data/VERSION +1 -0
- data/entity_types_edited.yml +930 -0
- data/features/association.feature +0 -0
- data/features/bank.feature +13 -0
- data/features/company.feature +19 -0
- data/features/cooperative.feature +15 -0
- data/features/corporation.feature +47 -0
- data/features/inc.feature +12 -0
- data/features/l3c.feature +15 -0
- data/features/ltd.feature +15 -0
- data/features/pac.feature +14 -0
- data/features/plc.feature +17 -0
- data/features/sc.feature +14 -0
- data/features/simple.feature +26 -0
- data/features/spa.feature +12 -0
- data/features/step_definitions/sluffix_steps.rb +24 -0
- data/features/support/env.rb +29 -0
- data/lib/fluffix.rb +21 -0
- data/lib/tasks/get.rb +86 -0
- data/spec/fluffix_spec.rb +7 -0
- data/spec/spec_helper.rb +29 -0
- metadata +186 -0
File without changes
|
@@ -0,0 +1,13 @@
|
|
1
|
+
Feature: cleanse a company name of suffix
|
2
|
+
In order to have clean company names that we could compare
|
3
|
+
|
4
|
+
Scenario: I have bank's
|
5
|
+
Given I have these names
|
6
|
+
| Prototype |
|
7
|
+
| Prototype Interactive Laboratories (dead in '99) |
|
8
|
+
And I have these suffixes
|
9
|
+
| Bank |
|
10
|
+
| Banking |
|
11
|
+
| Bankers |
|
12
|
+
And I combine them
|
13
|
+
Then the cleansed version should match the name
|
@@ -0,0 +1,19 @@
|
|
1
|
+
Feature: cleanse a company name of suffix
|
2
|
+
In order to have clean company names that we could compare
|
3
|
+
|
4
|
+
Scenario: I have Company suffixes
|
5
|
+
Given I have these names
|
6
|
+
| Prototype |
|
7
|
+
| Prototype Interactive Laboratories (dead in '99) |
|
8
|
+
And I have these suffixes
|
9
|
+
| and Company |
|
10
|
+
| and Co |
|
11
|
+
| & Company |
|
12
|
+
| & Co |
|
13
|
+
| + Company |
|
14
|
+
| + Co |
|
15
|
+
| Trust Company |
|
16
|
+
| Trust Co |
|
17
|
+
|
18
|
+
And I combine them
|
19
|
+
Then the cleansed version should match the name
|
@@ -0,0 +1,15 @@
|
|
1
|
+
Feature: cleanse a company name of suffix
|
2
|
+
In order to have clean company names that we could compare
|
3
|
+
|
4
|
+
Scenario: I have Coop
|
5
|
+
Given I have these names
|
6
|
+
| Prototype |
|
7
|
+
| Prototype Interactive Laboratories (dead in '99) |
|
8
|
+
And I have these suffixes
|
9
|
+
| Cooperative |
|
10
|
+
| Co-operative |
|
11
|
+
| Coop |
|
12
|
+
| Co-op |
|
13
|
+
|
14
|
+
And I combine them
|
15
|
+
Then the cleansed version should match the name
|
@@ -0,0 +1,47 @@
|
|
1
|
+
Feature: cleanse a company name of suffix
|
2
|
+
In order to have clean company names that we could compare
|
3
|
+
|
4
|
+
Scenario: I have Corporations
|
5
|
+
Given I have these names
|
6
|
+
| Prototype |
|
7
|
+
| Prototype Interactive Laboratories (dead in '99) |
|
8
|
+
And I have these suffixes
|
9
|
+
| Corporation |
|
10
|
+
| Corp |
|
11
|
+
| A Corp |
|
12
|
+
| A Professional Corp |
|
13
|
+
| A Pro Corp |
|
14
|
+
| A Pro. Corp |
|
15
|
+
| A Pro. Corporation |
|
16
|
+
| A Chartered Corp |
|
17
|
+
| A Chartered Corporation |
|
18
|
+
| A Service Corp |
|
19
|
+
| A Service Corporation |
|
20
|
+
| A Svc Corp |
|
21
|
+
| A Svc. Corp |
|
22
|
+
| A Svc Corporation |
|
23
|
+
| A Svc. Corporation |
|
24
|
+
| A Nonprofit Corp |
|
25
|
+
| A Non Profit Corp |
|
26
|
+
| A Not for Profit Corp |
|
27
|
+
| Professional Corp |
|
28
|
+
| Pro Corp |
|
29
|
+
| Pro. Corp |
|
30
|
+
| Pro. Corporation |
|
31
|
+
| Chartered Corp |
|
32
|
+
| Chartered Corporation |
|
33
|
+
| Service Corp |
|
34
|
+
| Service Corporation |
|
35
|
+
| Svc Corp |
|
36
|
+
| Svc. Corp |
|
37
|
+
| Svc Corporation |
|
38
|
+
| Svc. Corporation |
|
39
|
+
| Nonprofit Corp |
|
40
|
+
| Non Profit Corp |
|
41
|
+
| Not for Profit Corp |
|
42
|
+
| Nonprofit Corporation |
|
43
|
+
| Non Profit Corporation |
|
44
|
+
| Not for Profit Corporation |
|
45
|
+
|
46
|
+
And I combine them
|
47
|
+
Then the cleansed version should match the name
|
@@ -0,0 +1,12 @@
|
|
1
|
+
Feature: cleanse a company name of suffix
|
2
|
+
In order to have clean company names that we could compare
|
3
|
+
|
4
|
+
Scenario: I have Inc's
|
5
|
+
Given I have these names
|
6
|
+
| Prototype |
|
7
|
+
| Prototype Interactive Laboratories (dead in '99) |
|
8
|
+
And I have these suffixes
|
9
|
+
| Incorporated |
|
10
|
+
| Inc |
|
11
|
+
And I combine them
|
12
|
+
Then the cleansed version should match the name
|
@@ -0,0 +1,15 @@
|
|
1
|
+
Feature: cleanse a company name of suffix
|
2
|
+
In order to have clean company names that we could compare
|
3
|
+
|
4
|
+
Scenario: I have an L3C
|
5
|
+
Given I have these names
|
6
|
+
| Prototype |
|
7
|
+
| Prototype Interactive Laboratories (dead in '99) |
|
8
|
+
And I have these suffixes
|
9
|
+
| L3C |
|
10
|
+
| L.3.C |
|
11
|
+
| 13C |
|
12
|
+
| 1.3.C |
|
13
|
+
|
14
|
+
And I combine them
|
15
|
+
Then the cleansed version should match the name
|
@@ -0,0 +1,15 @@
|
|
1
|
+
Feature: cleanse a company name of suffix
|
2
|
+
In order to have clean company names that we could compare
|
3
|
+
|
4
|
+
Scenario: I have Ltd
|
5
|
+
Given I have these names
|
6
|
+
| Prototype |
|
7
|
+
| Prototype Interactive Laboratories (dead in '99) |
|
8
|
+
And I have these suffixes
|
9
|
+
| Limited |
|
10
|
+
| LTD |
|
11
|
+
| Limited Liability co |
|
12
|
+
| Limited Liability company |
|
13
|
+
|
14
|
+
And I combine them
|
15
|
+
Then the cleansed version should match the name
|
@@ -0,0 +1,14 @@
|
|
1
|
+
Feature: cleanse a company name of suffix
|
2
|
+
In order to have clean company names that we could compare
|
3
|
+
|
4
|
+
Scenario: I have P.A.C's
|
5
|
+
Given I have these names
|
6
|
+
| Prototype |
|
7
|
+
| Prototype Interactive Laboratories (dead in '99) |
|
8
|
+
And I have these suffixes
|
9
|
+
| PA |
|
10
|
+
| P.A |
|
11
|
+
| PC |
|
12
|
+
| P.C |
|
13
|
+
And I combine them
|
14
|
+
Then the cleansed version should match the name
|
@@ -0,0 +1,17 @@
|
|
1
|
+
Feature: cleanse a company name of suffix
|
2
|
+
In order to have clean company names that we could compare
|
3
|
+
|
4
|
+
Scenario: I have a PLC
|
5
|
+
Given I have these names
|
6
|
+
| Prototype |
|
7
|
+
| Prototype Interactive Laboratories (dead in '99) |
|
8
|
+
And I have these suffixes
|
9
|
+
| P.L.C |
|
10
|
+
| PLC |
|
11
|
+
| P.L.L.C |
|
12
|
+
| PLLC |
|
13
|
+
| L.L.C |
|
14
|
+
| LLC |
|
15
|
+
|
16
|
+
And I combine them
|
17
|
+
Then the cleansed version should match the name
|
data/features/sc.feature
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
Feature: cleanse a company name of suffix
|
2
|
+
In order to have clean company names that we could compare
|
3
|
+
|
4
|
+
Scenario: I have S.C's
|
5
|
+
Given I have these names
|
6
|
+
| Prototype |
|
7
|
+
| Prototype Interactive Laboratories (dead in '99) |
|
8
|
+
| prototype |
|
9
|
+
| prototype interactive laboratories (dead in '99) |
|
10
|
+
And I have these suffixes
|
11
|
+
| SC |
|
12
|
+
| S.C |
|
13
|
+
And I combine them
|
14
|
+
Then the cleansed version should match the name
|
@@ -0,0 +1,26 @@
|
|
1
|
+
Feature: cleanse a company name of suffix
|
2
|
+
In order to have clean company names that we could compare
|
3
|
+
|
4
|
+
Scenario: I have simple suffixes
|
5
|
+
Given I have these names
|
6
|
+
| Prototype |
|
7
|
+
| Prototype Interactive Laboratories (dead in '99) |
|
8
|
+
And I have these suffixes
|
9
|
+
| Deposit |
|
10
|
+
| Credit Union |
|
11
|
+
| Societa Per Azioni |
|
12
|
+
| Club |
|
13
|
+
| Foundation |
|
14
|
+
| Fund |
|
15
|
+
| Institute |
|
16
|
+
| Society |
|
17
|
+
| Union |
|
18
|
+
| Syndicate |
|
19
|
+
| Church |
|
20
|
+
| College |
|
21
|
+
| University |
|
22
|
+
| Chartered |
|
23
|
+
| League |
|
24
|
+
| Committee |
|
25
|
+
And I combine them
|
26
|
+
Then the cleansed version should match the name
|
@@ -0,0 +1,12 @@
|
|
1
|
+
Feature: cleanse a company name of suffix
|
2
|
+
In order to have clean company names that we could compare
|
3
|
+
|
4
|
+
Scenario: I have SPA's
|
5
|
+
Given I have these names
|
6
|
+
| Prototype |
|
7
|
+
| Prototype Interactive Laboratories (dead in '99) |
|
8
|
+
And I have these suffixes
|
9
|
+
| S.P.A |
|
10
|
+
| SPA |
|
11
|
+
And I combine them
|
12
|
+
Then the cleansed version should match the name
|
@@ -0,0 +1,24 @@
|
|
1
|
+
require "fluffix"
|
2
|
+
require "awesome_print"
|
3
|
+
|
4
|
+
# generics
|
5
|
+
# Reads the names from the step's table and keeps every name twice:
# once as written and once lower-cased.
Given(/^I have these names$/i) do |table|
  originals = table.raw.flatten
  @names = originals + originals.map(&:downcase)
end
|
9
|
+
|
10
|
+
# Reads the suffixes from the step's table and expands them into four
# variants each: as written, lower-cased, and both again with a trailing ".".
Given(/^I have these suffixes$/i) do |table|
  listed = table.raw.flatten
  cased = listed + listed.map(&:downcase)
  @suffixes = cased + cased.map { |suffix| "#{suffix}." }
end
|
15
|
+
|
16
|
+
# Builds one [input, expected] pair for every name/suffix combination:
# the input is "<name> <suffix>" and the expected result is the bare name.
Given(/^I combine them$/i) do
  @examples = @names.product(@suffixes).map do |name, suffix|
    ["#{name} #{suffix}", name]
  end
end
|
19
|
+
|
20
|
+
# Checks that cleansing each combined string gives back the bare name.
Then(/^The cleansed version should match the name$/i) do
  @examples.each do |combined, expected|
    Fluffix::US.cleanse(combined).should eq(expected)
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
# Cucumber support environment: configures SimpleCov, sets up Bundler, puts
# the gem's lib directory on the load path and loads the library under test.
require 'simplecov'

# Adds a helper to SimpleCov's configuration that resets the filter list.
module SimpleCov::Configuration
  def clean_filters
    @filters = []
  end
end

SimpleCov.configure do
  clean_filters
  load_adapter 'test_frameworks'
end

# Coverage is only started when the COVERAGE environment variable is set.
if ENV["COVERAGE"]
  SimpleCov.start do
    add_filter "/.rvm/"
  end
end

require 'bundler'
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => bundler_error
  # Report the problem and exit with Bundler's own status code.
  $stderr.puts bundler_error.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit bundler_error.status_code
end

# Make the gem's lib directory (two levels up from this file) loadable.
$LOAD_PATH.unshift("#{File.dirname(__FILE__)}/../../lib")
require 'fluffix'

require 'rspec/expectations'
|
data/lib/fluffix.rb
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
require "yaml"
|
2
|
+
require "awesome_print"
|
3
|
+
|
4
|
+
# Removes legal-entity suffixes (for example " Inc" or " Corp.") from the end
# of company names, using the suffix list shipped in entity_types_edited.yml.
module Fluffix
  # Absolute path of the bundled suffix list. It is resolved relative to this
  # file (lib/fluffix.rb -> ../entity_types_edited.yml) so that requiring the
  # library no longer depends on the current working directory.
  RULES_FILE = File.expand_path("../entity_types_edited.yml", __dir__)

  # One case-insensitive pattern per distinct suffix, longest suffix first.
  # Each pattern matches " <suffix>" at the end of a line, optionally followed
  # by a single period.
  #
  # The period is written as "\\.?" on purpose: inside a double-quoted string
  # "\." collapses to ".", which made the pattern accept ANY trailing
  # character (so " Corps" was stripped by the "Corp" rule).
  #
  # NOTE(review): the suffix text is still interpolated unescaped, exactly as
  # before. Confirm whether the YAML entries are meant to be literal text
  # (then wrap them in Regexp.escape) or regex fragments.
  RULES = YAML.load_file(RULES_FILE)[0]
              .flatten
              .uniq
              .sort { |a, b| b.size <=> a.size }
              .map { |suffix| Regexp.new(" #{suffix}\\.?$", Regexp::IGNORECASE) }
              .freeze

  # Suffix cleansing entry point.
  class US
    # Strips the first matching suffix from +text+. Rules are tried longest
    # suffix first, and only the first rule that matches is applied.
    #
    # Design note carried over from the original: the result should keep the
    # text in its original form and should handle / ignore punctuation.
    #
    # @param text [String] the company name to cleanse
    # @return [String] +text+ without the matched suffix, or +text+ itself
    #   when no rule matches
    # @raise [RuntimeError] if +text+ does not respond to +to_s+
    def self.cleanse(text)
      raise("Must quack like a string") unless text.respond_to?(:to_s)

      matching_rule = RULES.find { |rule| text =~ rule }
      matching_rule ? text.sub(matching_rule, '') : text
    end
  end
end
|
data/lib/tasks/get.rb
ADDED
@@ -0,0 +1,86 @@
|
|
1
|
+
# https://en.wikipedia.org/wiki/Types_of_business_entity
|
2
|
+
|
3
|
+
# pull the data, and through a series of techniques, tip out the likely business suffixes
|
4
|
+
require "nokogiri"
|
5
|
+
require "open-uri"
|
6
|
+
require "awesome_print"
|
7
|
+
require "yaml"
|
8
|
+
|
9
|
+
# Scrapes the Wikipedia "Types of business entity" page and writes the
# entity-type names found in its list items to entity_types.yml.
#
# Only list items containing "≈" are considered. Within those, only lines
# ending in ":" are kept; each kept line is split into the text before and
# inside its parentheses, with trailing whitespace and colons removed.
task :basic_get do
  # Kernel#open no longer opens URLs on Ruby 3.0+, so the page is opened
  # through the URI object (open-uri adds #open to it). The block form also
  # closes the response handle once the document has been parsed.
  page_uri = URI.parse("https://en.wikipedia.org/wiki/Types_of_business_entity")
  dom = page_uri.open { |html| Nokogiri::HTML.parse(html) }

  out_data = []
  dom.css("ul li").each do |tag|
    parts = tag.text.split("≈")
    next unless parts.length > 1

    names = parts.each_with_object([]) do |part, collected|
      part.split(/[\n]/).each do |raw_line|
        # Normalise the line and drop any "(i.e. ...)" aside.
        line = raw_line.strip.downcase.sub(/\(\s?i.e.\s[^\)]+\)/, "")
        next unless line.end_with?(":")

        # Turn "name (abbr):" into ["name", "abbr"] by marking each
        # parenthesised group with "~~~" and splitting on that marker.
        marked = line.gsub(/\(([^\)]+)\)/, '~~~\1')
        collected << marked.split("~~~").map { |item| item.sub(/[\s\:]+$/, '') }
      end
    end
    out_data << names
  end

  File.open("entity_types.yml", "w") do |f|
    f.write(out_data.to_yaml)
  end
end
|
38
|
+
|
39
|
+
# Placeholder task: nothing is implemented yet.
task :TLD do
  # Candidate source: https://github.com/alexrabarts/tld
  # If that one won't work, get a list of all TLDs instead.
end
|
43
|
+
|
44
|
+
# Counts how often every trailing word sequence occurs in the CompanyName
# column of a Companies House CSV extract and prints the 101 most frequent
# ones (with squared counts) as clues for common company suffixes.
#
# Data sources:
#   http://index.okfn.org/dataset/companies/
#   http://download.companieshouse.gov.uk/en_output.html
#
# Manual preparation:
#   GET http://download.companieshouse.gov.uk/BasicCompanyData-2015-12-01-part1_5.zip
#   and unzip it.
#
# The CSV location can be overridden with the COMPANY_DATA_CSV environment
# variable; without it the original hard-coded path is used.
task :open_company_data do
  require "csv"

  csv_path = ENV.fetch(
    "COMPANY_DATA_CSV",
    "/Users/johnjansen/Downloads/BasicCompanyData-2015-12-01-part1_5.csv"
  )

  output = Hash.new(0)
  CSV.foreach(csv_path, headers: true) do |row|
    # to_s keeps a row without a company name from aborting the whole run.
    parts = row["CompanyName"].to_s.split(" ").map(&:strip)

    # Count the full name, then the name minus its first word, and so on,
    # so every trailing word sequence is tallied once per row.
    until parts.empty?
      output[parts.join(" ")] += 1
      parts.shift
    end
  end

  ap output.map { |(a, b)| [a, b * b] }.sort { |a, b| b[1] <=> a[1] }[0..100]
end
|
65
|
+
|
66
|
+
# Placeholder task: only the planned approach is written down so far.
task :edgar do
  # Source: https://www.sec.gov/edgar/searchedgar/ftpusers.htm
  # Plan: pull all of EDGAR and work out the highest-frequency suffixes,
  # e.g. for "something incorporated":
  #   1) reverse it => "incorporated something"
  #   2) split it => ["incorporated", "something"]
  #   3) n-gram it from longest to shortest =>
  #        ["incorporated", "something"]
  #        ["incorporated"]
  #   4) go through the entire db and count instances of n-grams =>
  #        ["incorporated", "something"] => 1
  #        ["incorporated"] => 1
  #   5) square the frequencies and trim the long tail
  #   6) manually clean the result
end
|
81
|
+
|
82
|
+
# Prints the distinct, lower-cased entries of the first document element in
# entity_types_edited.yml, ordered from longest to shortest.
task :dump do
  entity_types = File.open("entity_types_edited.yml", "r") { |f| YAML.load(f) }
  suffixes = entity_types[0].flatten.uniq.map(&:downcase)
  ap suffixes.sort { |a, b| b.size <=> a.size }
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# RSpec helper: configures SimpleCov, extends the load path, loads RSpec and
# the library under test, then requires every support file.
require 'simplecov'

# Adds a helper to SimpleCov's configuration that resets the filter list.
module SimpleCov::Configuration
  def clean_filters
    @filters = []
  end
end

SimpleCov.configure do
  clean_filters
  load_adapter 'test_frameworks'
end

# Coverage is only started when the COVERAGE environment variable is set.
if ENV["COVERAGE"]
  SimpleCov.start do
    add_filter "/.rvm/"
  end
end

# Put ../lib first, then this directory in front of it, on the load path.
spec_dir = File.dirname(__FILE__)
$LOAD_PATH.unshift(File.join(spec_dir, '..', 'lib'))
$LOAD_PATH.unshift(spec_dir)

require 'rspec'
require 'fluffix'

# Requires supporting files with custom matchers and macros, etc,
# in ./support/ and its subdirectories.
Dir["#{File.dirname(__FILE__)}/support/**/*.rb"].each do |support_file|
  require support_file
end

# No RSpec options are set yet.
RSpec.configure do |config|
end
|