fluffix 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
File without changes
@@ -0,0 +1,13 @@
1
+ Feature: cleanse a company name of suffix
2
+ In order to have clean company names that we could compare
3
+
4
+ Scenario: I have bank's
5
+ Given I have these names
6
+ | Prototype |
7
+ | Prototype Interactive Laboratories (dead in '99) |
8
+ And I have these suffixes
9
+ | Bank |
10
+ | Banking |
11
+ | Bankers |
12
+ And I combine them
13
+ Then the cleansed version should match the name
@@ -0,0 +1,19 @@
1
+ Feature: cleanse a company name of suffix
2
+ In order to have clean company names that we could compare
3
+
4
+ Scenario: I have Company suffixes
5
+ Given I have these names
6
+ | Prototype |
7
+ | Prototype Interactive Laboratories (dead in '99) |
8
+ And I have these suffixes
9
+ | and Company |
10
+ | and Co |
11
+ | & Company |
12
+ | & Co |
13
+ | + Company |
14
+ | + Co |
15
+ | Trust Company |
16
+ | Trust Co |
17
+
18
+ And I combine them
19
+ Then the cleansed version should match the name
@@ -0,0 +1,15 @@
1
+ Feature: cleanse a company name of suffix
2
+ In order to have clean company names that we could compare
3
+
4
+ Scenario: I have Coop
5
+ Given I have these names
6
+ | Prototype |
7
+ | Prototype Interactive Laboratories (dead in '99) |
8
+ And I have these suffixes
9
+ | Cooperative |
10
+ | Co-operative |
11
+ | Coop |
12
+ | Co-op |
13
+
14
+ And I combine them
15
+ Then the cleansed version should match the name
@@ -0,0 +1,47 @@
1
+ Feature: cleanse a company name of suffix
2
+ In order to have clean company names that we could compare
3
+
4
+ Scenario: I have Corporations
5
+ Given I have these names
6
+ | Prototype |
7
+ | Prototype Interactive Laboratories (dead in '99) |
8
+ And I have these suffixes
9
+ | Corporation |
10
+ | Corp |
11
+ | A Corp |
12
+ | A Professional Corp |
13
+ | A Pro Corp |
14
+ | A Pro. Corp |
15
+ | A Pro. Corporation |
16
+ | A Chartered Corp |
17
+ | A Chartered Corporation |
18
+ | A Service Corp |
19
+ | A Service Corporation |
20
+ | A Svc Corp |
21
+ | A Svc. Corp |
22
+ | A Svc Corporation |
23
+ | A Svc. Corporation |
24
+ | A Nonprofit Corp |
25
+ | A Non Profit Corp |
26
+ | A Not for Profit Corp |
27
+ | Professional Corp |
28
+ | Pro Corp |
29
+ | Pro. Corp |
30
+ | Pro. Corporation |
31
+ | Chartered Corp |
32
+ | Chartered Corporation |
33
+ | Service Corp |
34
+ | Service Corporation |
35
+ | Svc Corp |
36
+ | Svc. Corp |
37
+ | Svc Corporation |
38
+ | Svc. Corporation |
39
+ | Nonprofit Corp |
40
+ | Non Profit Corp |
41
+ | Not for Profit Corp |
42
+ | Nonprofit Corporation |
43
+ | Non Profit Corporation |
44
+ | Not for Profit Corporation |
45
+
46
+ And I combine them
47
+ Then the cleansed version should match the name
@@ -0,0 +1,12 @@
1
+ Feature: cleanse a company name of suffix
2
+ In order to have clean company names that we could compare
3
+
4
+ Scenario: I have Inc's
5
+ Given I have these names
6
+ | Prototype |
7
+ | Prototype Interactive Laboratories (dead in '99) |
8
+ And I have these suffixes
9
+ | Incorporated |
10
+ | Inc |
11
+ And I combine them
12
+ Then the cleansed version should match the name
@@ -0,0 +1,15 @@
1
+ Feature: cleanse a company name of suffix
2
+ In order to have clean company names that we could compare
3
+
4
+ Scenario: I have an L3C
5
+ Given I have these names
6
+ | Prototype |
7
+ | Prototype Interactive Laboratories (dead in '99) |
8
+ And I have these suffixes
9
+ | L3C |
10
+ | L.3.C |
11
+ | 13C |
12
+ | 1.3.C |
13
+
14
+ And I combine them
15
+ Then the cleansed version should match the name
@@ -0,0 +1,15 @@
1
+ Feature: cleanse a company name of suffix
2
+ In order to have clean company names that we could compare
3
+
4
+ Scenario: I have Ltd
5
+ Given I have these names
6
+ | Prototype |
7
+ | Prototype Interactive Laboratories (dead in '99) |
8
+ And I have these suffixes
9
+ | Limited |
10
+ | LTD |
11
+ | Limited Liability co |
12
+ | Limited Liability company |
13
+
14
+ And I combine them
15
+ Then the cleansed version should match the name
@@ -0,0 +1,14 @@
1
+ Feature: cleanse a company name of suffix
2
+ In order to have clean company names that we could compare
3
+
4
+ Scenario: I have P.A.C's
5
+ Given I have these names
6
+ | Prototype |
7
+ | Prototype Interactive Laboratories (dead in '99) |
8
+ And I have these suffixes
9
+ | PA |
10
+ | P.A |
11
+ | PC |
12
+ | P.C |
13
+ And I combine them
14
+ Then the cleansed version should match the name
@@ -0,0 +1,17 @@
1
+ Feature: cleanse a company name of suffix
2
+ In order to have clean company names that we could compare
3
+
4
+ Scenario: I have a PLC
5
+ Given I have these names
6
+ | Prototype |
7
+ | Prototype Interactive Laboratories (dead in '99) |
8
+ And I have these suffixes
9
+ | P.L.C |
10
+ | PLC |
11
+ | P.L.L.C |
12
+ | PLLC |
13
+ | L.L.C |
14
+ | LLC |
15
+
16
+ And I combine them
17
+ Then the cleansed version should match the name
@@ -0,0 +1,14 @@
1
+ Feature: cleanse a company name of suffix
2
+ In order to have clean company names that we could compare
3
+
4
+ Scenario: I have S.C's
5
+ Given I have these names
6
+ | Prototype |
7
+ | Prototype Interactive Laboratories (dead in '99) |
8
+ | prototype |
9
+ | prototype interactive laboratories (dead in '99) |
10
+ And I have these suffixes
11
+ | SC |
12
+ | S.C |
13
+ And I combine them
14
+ Then the cleansed version should match the name
@@ -0,0 +1,26 @@
1
+ Feature: cleanse a company name of suffix
2
+ In order to have clean company names that we could compare
3
+
4
+ Scenario: I have other organisation types
5
+ Given I have these names
6
+ | Prototype |
7
+ | Prototype Interactive Laboratories (dead in '99) |
8
+ And I have these suffixes
9
+ | Deposit |
10
+ | Credit Union |
11
+ | Societa Per Azioni |
12
+ | Club |
13
+ | Foundation |
14
+ | Fund |
15
+ | Institute |
16
+ | Society |
17
+ | Union |
18
+ | Syndicate |
19
+ | Church |
20
+ | College |
21
+ | University |
22
+ | Chartered |
23
+ | League |
24
+ | Committee |
25
+ And I combine them
26
+ Then the cleansed version should match the name
@@ -0,0 +1,12 @@
1
+ Feature: cleanse a company name of suffix
2
+ In order to have clean company names that we could compare
3
+
4
+ Scenario: I have SPA's
5
+ Given I have these names
6
+ | Prototype |
7
+ | Prototype Interactive Laboratories (dead in '99) |
8
+ And I have these suffixes
9
+ | S.P.A |
10
+ | SPA |
11
+ And I combine them
12
+ Then the cleansed version should match the name
@@ -0,0 +1,24 @@
1
+ require "fluffix"
2
+ require "awesome_print"
3
+
4
+ # generics
5
# Stores every name from the step's table twice: once as written and once
# lower-cased, so later steps exercise both casings.
Given(/^I have these names$/i) do |table|
  originals = table.raw.flatten
  lowered = originals.map(&:downcase)
  @names = originals + lowered
end
9
+
10
# Expands the suffix table into four variants per entry: as written,
# lower-cased, and each of those again with a trailing period.
Given(/^I have these suffixes$/i) do |table|
  listed = table.raw.flatten
  cased = listed + listed.map(&:downcase)
  dotted = cased.map { |suffix| "#{suffix}." }
  @suffixes = cased + dotted
end
15
+
16
# Builds one [input, expected] pair for every name/suffix combination:
# the input is "name suffix" and the expected cleansed value is the bare name.
Given(/^I combine them$/i) do
  @examples = @names.product(@suffixes).map do |name, suffix|
    ["#{name} #{suffix}", name]
  end
end
19
+
20
# Cleanses every generated example and checks that the bare name comes back.
#
# Uses the `expect` syntax rather than the deprecated `should` monkey patch,
# and names the offending input in the failure message so a single bad
# combination can be identified among the generated examples.
Then(/^The cleansed version should match the name$/i) do
  @examples.each do |input, expected|
    cleansed = Fluffix::US.cleanse(input)
    expect(cleansed).to eq(expected),
      "expected #{input.inspect} to cleanse to #{expected.inspect}, got #{cleansed.inspect}"
  end
end
@@ -0,0 +1,29 @@
1
+ require 'simplecov'
2
+
3
# Adds a helper to SimpleCov's configuration DSL that resets the filter
# list to an empty array.
SimpleCov::Configuration.module_eval do
  def clean_filters
    @filters = []
  end
end

# Reset the filters, then load the 'test_frameworks' adapter.
SimpleCov.configure do
  clean_filters
  load_adapter 'test_frameworks'
end

# Coverage is only started when the COVERAGE environment variable is set.
if ENV["COVERAGE"]
  SimpleCov.start do
    add_filter "/.rvm/"
  end
end
17
require 'bundler'

# Activate the default and development gem groups; on failure print the
# Bundler error plus a hint and exit with Bundler's own status code.
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => error
  $stderr.puts(error.message, "Run `bundle install` to install missing gems")
  exit error.status_code
end

# Put the gem's lib directory (two levels up from this file) on the load path.
$LOAD_PATH.unshift("#{File.dirname(__FILE__)}/../../lib")
require 'fluffix'

require 'rspec/expectations'
@@ -0,0 +1,21 @@
1
+ require "yaml"
2
+ require "awesome_print"
3
+
4
+ module Fluffix
5
+
6
# Suffix-matching rules, compiled once when the module is loaded.
#
# The first document entry of entity_types_edited.yml is flattened,
# de-duplicated and ordered longest-first, so that longer suffixes are tried
# before any shorter suffix they contain. Each entry becomes a
# case-insensitive pattern matching " <suffix>" at the end of a line,
# optionally followed by a single literal period.
#
# The period is written as "\\." because "\." inside a double-quoted string
# collapses to a bare ".", which would match ANY trailing character.
# YAML.load_file is used so the file handle is closed after reading.
#
# NOTE(review): entries are interpolated unescaped, so regexp metacharacters
# in the YAML (e.g. the dots in "p.a") act as wildcards - confirm whether the
# data file relies on this before wrapping them in Regexp.escape.
# NOTE(review): the path is resolved against the current working directory.
RULES = YAML.load_file("entity_types_edited.yml")[0]
  .flatten
  .uniq
  .sort { |a, b| b.size <=> a.size }
  .map { |r| Regexp.new(" #{r}\\.?$", Regexp::IGNORECASE) }
  .freeze
7
+
8
# Cleanser for US-style company names.
class US
  # Removes the first matching entity suffix from the end of a company name.
  #
  # Rules are tried in RULES order and only the first match is removed; the
  # rest of the text keeps its original casing and punctuation.
  #
  # The input is matched through its #to_s form, so string-like values such
  # as Symbols are cleansed instead of failing on String-only methods. When
  # no rule matches, the original object is returned untouched.
  #
  # @param text [#to_s] the company name to cleanse
  # @return [String, Object] the name without its suffix, or +text+ itself
  #   when no rule matches
  # @raise [RuntimeError] if +text+ does not respond to #to_s
  def self.cleanse(text)
    raise("Must quack like a string") unless text.respond_to?(:to_s)

    candidate = text.to_s
    RULES.each do |rule|
      return candidate.sub(rule, '') if candidate =~ rule
    end
    text
  end
end
21
+ end
@@ -0,0 +1,86 @@
1
+ # https://en.wikipedia.org/wiki/Types_of_business_entity
2
+
3
+ # pull the data, and through a series of techniques, tip out the likely business suffixes
4
+ require "nokogiri"
5
+ require "open-uri"
6
+ require "awesome_print"
7
+ require "yaml"
8
+
9
# Scrapes Wikipedia's "Types of business entity" page and writes the
# candidate entity names found there to entity_types.yml.
#
# For every <li> whose text contains "≈", each "≈"-separated part is split
# into lines. A line is kept only if it ends with ":" after being stripped,
# lower-cased and relieved of any "(i.e. ...)" aside. A kept line is split
# into its leading phrase plus each parenthesised alternative, with trailing
# whitespace and colons removed from every piece.
#
# The page is fetched with URI#open because Kernel#open no longer handles
# URLs on Ruby 3; the block form closes the handle once parsing is done.
task :basic_get do
  url = "https://en.wikipedia.org/wiki/Types_of_business_entity"
  dom = URI.parse(url).open { |io| Nokogiri::HTML.parse(io) }

  out_data = []
  dom.css("ul li").each do |tag|
    parts = tag.text.split("≈")
    next unless parts.length > 1

    groups = parts.each_with_object([]) do |part, collected|
      part.split(/[\n]/).each do |raw_line|
        # Dots are escaped so only a literal "i.e." aside is removed.
        line = raw_line.strip.downcase.sub(/\(\s?i\.e\.\s[^\)]+\)/, "")
        next unless line.end_with?(":")

        # Turn "(alt)" into a "~~~alt" marker so one split yields every variant.
        marked = line.gsub(/\(([^\)]+)\)/, '~~~\1')
        collected << marked.split("~~~").map { |item| item.sub(/[\s\:]+$/, '') }
      end
    end
    out_data << groups
  end

  File.open("entity_types.yml", "w") do |f|
    f.write(out_data.to_yaml)
  end
end
38
+
39
# Placeholder task: nothing is implemented yet.
task(:TLD) do
  # https://github.com/alexrabarts/tld
  # Fall back to fetching a list of all TLDs if that gem does not work.
end
43
+
44
# Counts how often each trailing word sequence occurs in the CompanyName
# column of a Companies House CSV export and prints the highest scoring
# ones, as clues about common company suffixes.
#
# Sources:
#   http://index.okfn.org/dataset/companies/
#   http://download.companieshouse.gov.uk/en_output.html
# Download and unzip, for example,
#   http://download.companieshouse.gov.uk/BasicCompanyData-2015-12-01-part1_5.zip
#
# The CSV location may be passed as a task argument,
#   rake "open_company_data[/path/to/file.csv]"
# and defaults to the path that was previously hard-coded. Rows with no
# CompanyName value are skipped instead of raising NoMethodError.
task :open_company_data, [:csv_path] do |_task, args|
  require "csv"

  args.with_defaults(csv_path: "/Users/johnjansen/Downloads/BasicCompanyData-2015-12-01-part1_5.csv")

  output = Hash.new(0)
  CSV.foreach(args[:csv_path], headers: true) do |row|
    company_name = row["CompanyName"]
    next if company_name.nil?

    parts = company_name.split(" ").map(&:strip)
    # Count every trailing word sequence: "a b c", then "b c", then "c".
    parts.each_index do |start|
      output[parts[start..-1].join(" ")] += 1
    end
  end

  # Squaring the counts exaggerates the gap between frequent and rare phrases.
  ap output.map { |phrase, count| [phrase, count * count] }.sort { |a, b| b[1] <=> a[1] }[0..100]
end
65
+
66
# Placeholder task: the plan below is not implemented yet.
task(:edgar) do
  # https://www.sec.gov/edgar/searchedgar/ftpusers.htm
  # Pull all of EDGAR and work out the highest-frequency suffixes,
  # e.g. for "something incorporated":
  #   1) reverse it => "incorporated something"
  #   2) split it => ["incorporated", "something"]
  #   3) n-gram it from longest to shortest =>
  #        ["incorporated", "something"]
  #        ["incorporated"]
  #   4) go through the entire db and count instances of n-grams =>
  #        ["incorporated", "something"] => 1
  #        ["incorporated"] => 1
  #   5) square the frequencies and trim the long tail
  #   6) manually clean the result
end
81
+
82
# Pretty-prints the entries in the first element of entity_types_edited.yml:
# flattened, de-duplicated, lower-cased and ordered longest-first.
task :dump do
  document = File.open("entity_types_edited.yml", "r") { |f| YAML.load(f) }
  entries = document[0].flatten.uniq.map(&:downcase)
  ap entries.sort { |a, b| b.size <=> a.size }
end
@@ -0,0 +1,7 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
2
+
3
# Generated placeholder spec: it always fails, as a reminder to write real
# examples for Fluffix.
describe("Fluffix") do
  it("fails") do
    raise "hey buddy, you should probably rename this file and start specing for real"
  end
end
@@ -0,0 +1,29 @@
1
+ require 'simplecov'
2
+
3
# Adds a helper to SimpleCov's configuration DSL that resets the filter
# list to an empty array.
SimpleCov::Configuration.module_eval do
  def clean_filters
    @filters = []
  end
end

# Reset the filters, then load the 'test_frameworks' adapter.
SimpleCov.configure do
  clean_filters
  load_adapter 'test_frameworks'
end

# Coverage is only started when the COVERAGE environment variable is set.
if ENV["COVERAGE"]
  SimpleCov.start do
    add_filter "/.rvm/"
  end
end
17
# Put the spec directory first on the load path, followed by the gem's lib
# directory.
spec_dir = File.dirname(__FILE__)
$LOAD_PATH.unshift(spec_dir, File.join(spec_dir, '..', 'lib'))

require 'rspec'
require 'fluffix'

# Load every Ruby file under ./support/ and its subdirectories
# (custom matchers, macros and similar helpers).
Dir[File.join(spec_dir, "support", "**", "*.rb")].each do |support_file|
  require support_file
end

# No RSpec options are customised yet.
RSpec.configure do |config|
end