fias 0.0.2 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +15 -22
- data/.rubocop.yml +7 -0
- data/.travis.yml +10 -0
- data/Gemfile +1 -1
- data/LICENSE.txt +2 -2
- data/README.md +259 -155
- data/Rakefile +6 -1
- data/config/names.txt +0 -0
- data/config/synonyms.yml +50 -0
- data/examples/create.rb +106 -0
- data/examples/generate_index.rb +63 -0
- data/fias.gemspec +33 -21
- data/lib/fias.rb +197 -10
- data/lib/fias/config.rb +74 -0
- data/lib/fias/import/copy.rb +62 -0
- data/lib/fias/import/dbf.rb +81 -0
- data/lib/fias/import/download_service.rb +37 -0
- data/lib/fias/import/restore_parent_id.rb +51 -0
- data/lib/fias/import/tables.rb +74 -0
- data/lib/fias/name/append.rb +30 -0
- data/lib/fias/name/canonical.rb +42 -0
- data/lib/fias/name/extract.rb +85 -0
- data/lib/fias/name/house_number.rb +71 -0
- data/lib/fias/name/split.rb +60 -0
- data/lib/fias/name/synonyms.rb +93 -0
- data/lib/fias/query.rb +43 -0
- data/lib/fias/query/estimate.rb +67 -0
- data/lib/fias/query/finder.rb +75 -0
- data/lib/fias/query/params.rb +101 -0
- data/lib/fias/railtie.rb +3 -17
- data/lib/fias/version.rb +1 -1
- data/spec/fixtures/ACTSTAT.DBF +0 -0
- data/spec/fixtures/NORDOC99.DBF +0 -0
- data/spec/fixtures/STRSTAT.DBF +0 -0
- data/spec/fixtures/addressing.yml +93 -0
- data/spec/fixtures/query.yml +79 -0
- data/spec/fixtures/query_sanitization.yml +75 -0
- data/spec/fixtures/status_append.yml +60 -0
- data/spec/lib/import/copy_spec.rb +44 -0
- data/spec/lib/import/dbf_spec.rb +28 -0
- data/spec/lib/import/download_service_spec.rb +15 -0
- data/spec/lib/import/restore_parent_id_spec.rb +34 -0
- data/spec/lib/import/tables_spec.rb +26 -0
- data/spec/lib/name/append_spec.rb +14 -0
- data/spec/lib/name/canonical_spec.rb +20 -0
- data/spec/lib/name/extract_spec.rb +67 -0
- data/spec/lib/name/house_number_spec.rb +45 -0
- data/spec/lib/name/query_spec.rb +21 -0
- data/spec/lib/name/split_spec.rb +15 -0
- data/spec/lib/name/synonyms_spec.rb +51 -0
- data/spec/lib/query/params_spec.rb +15 -0
- data/spec/lib/query_spec.rb +27 -0
- data/spec/spec_helper.rb +30 -0
- data/spec/support/db.rb +30 -0
- data/spec/support/query.rb +13 -0
- data/tasks/db.rake +52 -0
- data/tasks/download.rake +15 -0
- metadata +246 -64
- data/lib/fias/active_record/address_object.rb +0 -231
- data/lib/fias/active_record/address_object_type.rb +0 -15
- data/lib/fias/dbf_wrapper.rb +0 -90
- data/lib/fias/importer.rb +0 -30
- data/lib/fias/importer/base.rb +0 -59
- data/lib/fias/importer/pg.rb +0 -81
- data/lib/fias/importer/sqlite.rb +0 -38
- data/lib/generators/fias/migration.rb +0 -34
- data/lib/generators/fias/templates/create_fias_tables.rb +0 -5
- data/tasks/fias.rake +0 -68
@@ -0,0 +1,85 @@
|
|
1
|
+
module Fias
  module Name
    # Pulls a known abbreviation out of an address-object name.
    #
    # Candidates are the keys of Fias.config.index (presumably status/type
    # abbreviations such as "ул" -- confirm against the config). Every
    # candidate found in the name as a separate word is scored, and the name
    # is split only when exactly one candidate holds the highest score.
    module Extract
      class << self
        # Matches a single lowercase Cyrillic letter.
        SMALL_LETTER = /[а-яё]/u

        # Score bonuses: the candidate ends with a dot, the candidate starts
        # with a small letter, and the constant added to the head-side
        # distance in #border_proximity.
        REWARD = {
          dot: 3, small_letter: 2, head: 1
        }

        # @param name [String, nil] raw name
        # @return [Array<String>, nil] nil for a blank name; the
        #   whitespace-normalized name alone when no candidate wins
        #   unambiguously (or nothing but the candidate would remain);
        #   otherwise the remaining toponym followed by whatever
        #   Canonical.canonical returns for the winning candidate, flattened.
        def extract(name)
          return if name.blank?

          normalized = cleanup(name)
          rated = find(normalized).map { |found| rate_match(normalized, found) }
          winner = pick_winner(rated)

          winner ? extract_name(normalized, winner) : [normalized]
        end

        private

        # Collapses every whitespace run into one space and trims the ends.
        def cleanup(name)
          words = name.split(' ')
          words.join(' ').strip
        end

        # Returns a MatchData for every configured query that occurs in
        # +name+ preceded by whitespace or line start and followed by a dot,
        # whitespace or line end (case-insensitive).
        def find(name)
          Fias.config.index.keys.each_with_object([]) do |query, found|
            hit = name.match(/(\s|^)(#{Regexp.escape(query)})(\.|\s|$)/ui)
            found << hit if hit && hit[2]
          end
        end

        # Builds the [score, match] pair for one candidate. Bonuses are
        # multiplied by 100 so that the candidate length only breaks ties
        # between otherwise equal scores.
        def rate_match(name, match)
          abbreviation = match[2]

          bonus = border_proximity(name, match)
          bonus += REWARD[:dot] if abbreviation[-1] == '.'
          bonus += REWARD[:small_letter] if abbreviation[0] =~ SMALL_LETTER

          [bonus * 100 + abbreviation.size, match]
        end

        # The larger of two offsets: characters from the match start to the
        # end of the name (plus REWARD[:head]) and the end offset of the
        # matched candidate.
        def border_proximity(name, match)
          from_start = name.size - match.begin(1) + REWARD[:head]
          to_end = match.end(2)
          from_start > to_end ? from_start : to_end
        end

        # Returns the match with the strictly highest score, or nil when
        # there are no candidates or the top score is shared.
        def pick_winner(rates)
          best = rates.map(&:first).max
          leaders = rates.select { |(score, _)| score == best }
          leaders.first.last if leaders.size == 1
        end

        # Removes every occurrence of the winning pattern from +name+ and
        # pairs what is left with the canonical form of the candidate.
        def extract_name(name, winner)
          remainder = cleanup(name.gsub(winner.regexp, ' '))

          if remainder.blank?
            [name]
          else
            [remainder, Canonical.canonical(winner[2])].flatten
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
module Fias
  module Name
    # Separates a house number part from the rest of a name.
    module HouseNumber
      class << self
        # @param name [String] name possibly carrying a house number
        # @return [Array] +[name, nil]+ (untouched) when no house number is
        #   recognised, otherwise +[name_part, number_part]+, both stripped.
        def extract(name)
          return [name, nil] unless contains_number?(name)

          street, number = split_number(name)
          [street.strip, number.strip]
        end

        private

        # Tries the splitting strategies in priority order and returns the
        # first two-element result.
        def split_number(name)
          try_split_by_colon(name) ||
            try_housing(name) ||
            try_house_word(name) ||
            try_ends_with_number(name)
        end

        # A name qualifies unless it is just digits/whitespace or ends like a
        # numbered line/microdistrict; it must then match at least one of the
        # patterns the strategies above rely on.
        def contains_number?(name)
          return false if name =~ JUST_A_NUMBER
          return false if name =~ LINE_OR_MICRODISTRICT

          [COLON, ENDS_WITH_NUMBER, HOUSE_WORD, NUMBER_WITH_HOUSING].any? do |pattern|
            name =~ pattern
          end
        end

        # Splits at the first comma (COLON, despite its name, matches a
        # comma), swallowing whitespace around it.
        def try_split_by_colon(name)
          return unless name =~ COLON
          name.split(/\s*,\s*/, 2)
        end

        # "<number> <housing word> <number>": everything from the first
        # number onwards becomes the number part.
        def try_housing(name)
          found = name.match(NUMBER_WITH_HOUSING)
          return unless found
          [found.pre_match, "#{found} #{found.post_match}"]
        end

        # Splits around a house word; the word itself is dropped.
        def try_house_word(name)
          found = name.match(HOUSE_WORD)
          return unless found
          [found.pre_match, found.post_match]
        end

        # Takes a number sitting at the very end of the name.
        def try_ends_with_number(name)
          found = name.match(ENDS_WITH_NUMBER)
          return unless found
          [found.pre_match, found[1]]
        end

        # Builds a regexp alternation body with longer words first, so a
        # longer word is tried before a shorter one it begins with.
        def or_words(words)
          longest_first = words.sort_by(&:length).reverse
          longest_first.map { |word| Regexp.escape(word) }.join('|')
        end
      end

      # NOTE: named COLON, but the pattern matches a comma.
      COLON = /\,/
      JUST_A_NUMBER = /^[\s\d]+$/
      # Presumably "microdistrict"/"line" and their abbreviations, which are
      # followed by an ordinal rather than a house number -- confirm.
      STOPWORDS = /(микрорайон|линия|микр|мкрн|мкр|лин)/ui
      LINE_OR_MICRODISTRICT = /#{STOPWORDS}\.?[\s\w+]?\d+$/ui
      NUMBER = /\d+\/?#{Fias::LETTERS}?\d*/ui
      ENDS_WITH_NUMBER = /(#{NUMBER})$/ui
      HOUSE_WORDS = %w(ом д дом вл кв)
      HOUSE_WORD =
        /(\s|\,|\.|^)(#{or_words(HOUSE_WORDS)})(\s|\,|\.|$)/ui
      HOUSING_WORDS = %w(корпус корп к)
      NUMBER_WITH_HOUSING =
        /#{NUMBER}[\s\,\.]+(#{or_words(HOUSING_WORDS)})[\s\,\.]+#{NUMBER}/ui
    end
  end
end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
module Fias
  module Name
    # Breaks a name into a list of unique, normalized words.
    module Split
      class << self
        # @param name [String] raw name
        # @return [Array<String>] unique non-blank words, lowercased, with
        #   quotes and round brackets removed, initials separated from the
        #   word they precede and dot-terminated words split apart.
        def split(name)
          scanned = sanitize(name).scan(Fias.word)
          pieces = split_all_dotwords(split_all_initials(cleanup_brackets(scanned)))

          pieces.reject(&:blank?).flatten.uniq
        end

        private

        # Lowercases, replaces "ё" with "е" and removes quote characters.
        def sanitize(name)
          lowered = Unicode.downcase(name)
          lowered.gsub('ё', 'е').gsub(QUOTAS, '')
        end

        # Keeps the first element of every scan result and removes round
        # brackets from it.
        def cleanup_brackets(words)
          words.map do |word, _|
            word.gsub(BRACKETS, '')
          end
        end

        # Replaces each word that starts with initials by its parts; other
        # words pass through. Parts a pattern did not capture come back as
        # nil and are dropped later by #split_all_dotwords.
        def split_all_initials(words)
          words.flat_map { |word, _| split_initials(word) || word }
        end

        # @return [Array, nil] captured parts of a word matching
        #   Fias::INITIALS or Fias::SINGLE_INITIAL, nil when neither matches.
        def split_initials(word)
          if (several = word.match(Fias::INITIALS))
            several.values_at(1, 3)
          elsif (single = word.match(Fias::SINGLE_INITIAL))
            single.values_at(2, 3)
          end
        end

        # Replaces each word containing dot-terminated parts by an array of
        # those parts and drops nil entries. The result may be nested.
        def split_all_dotwords(words)
          words.each_with_object([]) do |(word, _), result|
            piece = split_dotwords(word) || word
            result << piece unless piece.nil?
          end
        end

        # @return [Array<String>, nil] unique parts of +word+ split after
        #   every dot-terminated run of two or more letters, nil when the
        #   word contains none.
        def split_dotwords(word)
          return unless word =~ DOTWORD

          spaced = word.gsub(DOTWORD, '\1 ')
          spaced.split(' ').uniq.reject(&:blank?)
        end
      end

      DOTWORD = /(#{LETTERS}{2,}\.)/ui
      BRACKETS = /(\(|\))/
      QUOTAS = /[\"\']/
    end
  end
end
|
@@ -0,0 +1,93 @@
|
|
1
|
+
module Fias
  module Name
    # Expands a name into the alternative spellings of each of its tokens.
    module Synonyms
      class << self
        # Capture of everything between the first "(" and the last ")".
        IN_BRACKETS = /\((.*)\)/
        # Marker variant meaning "this token may be omitted".
        OPTIONAL = ''
        NUMERAL_SUFFIXES = %w(й я е ая ий ый ой ые ое го)
        ANNIVESARY_FORMS =
          ['\1-летия', '\1-лет', '\1 летия', '\1 лет', '\1-летие', '\1 летие']

        # @param name [String]
        # @return [Array<Array<String>>] for each token of Split.split(name),
        #   the list of its variants.
        def expand(name)
          Split.split(name).map do |token|
            Array.wrap(tokenize(name, token))
          end
        end

        # @return [Array<String>] every distinct variant of every token.
        def tokens(name)
          expand(name).flatten.uniq
        end

        # @return [Array<String>] every combination of one variant per token;
        #   inside a combination blank variants are dropped and the rest are
        #   sorted and joined with a space.
        def forms(name)
          recombine(expand(name))
        end

        private

        # Returns the variants produced by the first rule that applies, or
        # the token itself when none does.
        def tokenize(name, token)
          synonyms(token) || bracketed(name, token) || proper_names(token) ||
            initials(token) || anniversary(token) || numerals(token) || token
        end

        def synonyms(token)
          Fias.config.synonyms_index[token]
        end

        # A token that occurs inside the bracketed part of the name is
        # optional.
        def bracketed(name, token)
          inside = name.match(IN_BRACKETS)
          return unless inside && inside[1].include?(token)

          [token, OPTIONAL]
        end

        def proper_names(token)
          return unless Fias.config.proper_names.include?(token)

          [token, OPTIONAL]
        end

        # Optional only when the token matches both initials patterns.
        def initials(token)
          [token, OPTIONAL] if (Fias::INITIALS =~ token) && (Fias::SINGLE_INITIAL =~ token)
        end

        # Rewrites a token matching Fias::ANNIVESARIES into each of the
        # ANNIVESARY_FORMS spellings.
        def anniversary(token)
          return unless token =~ Fias::ANNIVESARIES

          ANNIVESARY_FORMS.map { |form| token.gsub(Fias::ANNIVESARIES, form) }
        end

        def numerals(token)
          numerals_for(token) if (/^\d+/ =~ token) || (Fias::ANNIVESARIES =~ token)
        end

        # The digits of +numeral+ combined with every suffix (both glued and
        # hyphenated), followed by the bare digits.
        def numerals_for(numeral)
          digits = numeral.gsub(/[^\d]/, '')

          suffixed = NUMERAL_SUFFIXES.flat_map do |suffix|
            ["#{digits}#{suffix}", "#{digits}-#{suffix}"]
          end

          suffixed + [digits]
        end

        # Cartesian product of the per-token variant lists, each combination
        # normalized into a single string.
        def recombine(variants)
          return variants if variants.empty?

          first, *others = variants
          first.product(*others).map do |combination|
            combination.reject(&:blank?).sort.join(' ')
          end
        end
      end
    end
  end
end
|
data/lib/fias/query.rb
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
module Fias
  # Search pipeline meant to be mixed into a class that supplies the actual
  # lookup by overriding #find.
  #
  # The host's #find is handed to Finder as a Method object. Finder uses it
  # to obtain candidate endpoints, assembles chains from them, and every
  # chain is then scored by Estimate.
  module Query
    # @param params [Object] raw request, passed unchanged to Params.new
    def initialize(params)
      @params = Params.new(params)
      @finder = Finder.new(@params, method(:find))
    end

    attr_reader :params

    # Runs the search.
    #
    # @return [Array<Array>] +[rate, first_chain_item]+ pairs for the chains
    #   sharing the highest rate; empty when Finder produced no chains.
    def perform
      estimate(@finder.assumption)
    end

    protected

    # Lookup hook the including class must override.
    #
    # @param _tokens [Object] the words Finder asks about
    # @raise [NotImplementedError] always, unless overridden; the message
    #   names the class that is missing the implementation.
    def find(_tokens)
      fail NotImplementedError, "#{self.class} must implement #find(tokens)"
    end

    # Rates the chains and keeps only the best-rated ones.
    def estimate(assumption)
      reject_invalid_chains(estimate_chains(assumption))
    end

    # @return [Array<Array>] +[rate, chain.first]+ pairs, highest rate first.
    def estimate_chains(assumption)
      assumption
        .map { |chain| [rate(chain), chain.first] }
        .sort_by(&:first)
        .reverse
    end

    # Keeps (in place) only the pairs whose rate equals the rate of the
    # first pair; expects +chains+ to be ordered best-first.
    def reject_invalid_chains(chains)
      return chains if chains.empty?

      highest_rate = chains.first.first
      chains.keep_if { |c| c.first == highest_rate }
    end

    def rate(chain)
      Estimate.new(@params, chain).estimate
    end
  end
end
|
@@ -0,0 +1,67 @@
|
|
1
|
+
module Fias
  module Query
    # Scores one candidate chain against the request parameters. A chain is
    # a list of hash-like items read through +:key+, +:abbr+, +:tokens+ and
    # (for the first item) +:ancestry+.
    class Estimate
      # Weight of each criterion, in decreasing priority.
      RATES = {
        subject: 10_000, # First: the chain starts with the first requested key
        found_part: 1_000, # Then: the more parts the chain has, the better
        type: 100, # Then: the status (abbreviation) should coincide
        name: 5, # Then: how many name tokens match
        deep: -1 # Then: penalty per ancestor of the chain's first item
      }.freeze

      # @param params [#sanitized, #synonyms] request parameters; both
      #   readers are iterated as key => array pairs
      # @param chain [Array<Hash>] candidate chain, most specific item first
      def initialize(params, chain)
        @params = params
        @chain = chain
      end

      # @return [Integer] total score of the chain; higher is better.
      def estimate
        for_subject + for_found_parts + for_type + for_deepness +
          for_name_proximity
      end

      private

      # Bonus when the chain's first item has the first requested key.
      # An empty chain earns nothing.
      def for_subject
        head = @chain.first
        return 0 unless head

        @params.sanitized.keys.first == head[:key] ? RATES[:subject] : 0
      end

      def for_found_parts
        @chain.size * RATES[:found_part]
      end

      # Bonus for every requested key whose expected statuses (everything
      # after the first element of its sanitized value) include the :abbr of
      # the chain item with that key.
      def for_type
        @params.sanitized.sum do |key, (_, *expected_status)|
          received_status = chain_value(key, :abbr)
          expected_status.include?(received_status) ? RATES[:type] : 0
        end
      end

      # Penalty proportional to the ancestry length of the first item.
      def for_deepness
        head = @chain.first
        head ? head[:ancestry].size * RATES[:deep] : 0
      end

      # Bonus per token shared between the expected synonyms of a key and
      # the :tokens of the chain item with that key.
      def for_name_proximity
        @params.synonyms.sum do |key, (expected, _)|
          given = chain_value(key, :tokens) || []
          (given & expected.flatten.uniq).size * RATES[:name]
        end
      end

      # Reads +attribute+ from the chain item registered under +key+;
      # nil when the chain has no such item.
      def chain_value(key, attribute)
        item = chain_by_key[key]
        item && item[attribute]
      end

      # Chain items indexed by their :key (the last one wins on duplicates).
      def chain_by_key
        @chain_by_key ||= @chain.each_with_object({}) do |item, index|
          index[item[:key]] = item
        end
      end
    end
  end
end
|
@@ -0,0 +1,75 @@
|
|
1
|
+
module Fias
  module Query
    # Looks up candidate endpoints for every requested key and assembles
    # them into chains that are consistent by ancestry.
    #
    # Endpoints are hash-like records returned by the +find+ callable and
    # read through +:id+, +:forms+ and +:ancestry+; each of them is tagged in
    # place with the +:key+ it was found for.
    class Finder
      # @param params [#split, #forms] request parameters; both readers are
      #   indexed by key
      # @param find [#call] lookup callable; receives the words of one key
      #   and returns a list of endpoints
      def initialize(params, find)
        @params = params
        @find = find
      end

      # @return [Array<Array<Hash>>] chains, each starting with an endpoint
      #   of the first requested key followed by the found endpoints of the
      #   other keys that appear in its ancestry; +[]+ when nothing was
      #   requested.
      def assumption
        find_endpoints
        return [] if @endpoints.empty?

        reject_inconsistent_chains
      end

      private

      # Fills @endpoints with key => endpoints, in request order.
      def find_endpoints
        @endpoints = @params.split.keys.each_with_object({}) do |key, found|
          found[key] = find_endpoint(key)
        end
        inject_key_to_endpoints
      end

      # Endpoints found for the words of +key+ that also share a form with
      # the request.
      def find_endpoint(key)
        reject_endpoints(find(@params.split[key]), key)
      end

      def find(words)
        @find.call(words)
      end

      # Drops endpoints having no form in common with the requested forms.
      def reject_endpoints(endpoints, key)
        forms = @params.forms[key]

        endpoints.reject { |endpoint| (forms & endpoint[:forms]).empty? }
      end

      # Tags every endpoint (mutating it) with the key it belongs to.
      def inject_key_to_endpoints
        @endpoints.each do |key, endpoints|
          endpoints.each { |endpoint| endpoint[:key] = key }
        end
      end

      # Builds a chain for every endpoint of the first key. When other keys
      # produced endpoints, an endpoint is kept only if at least one of them
      # is among its ancestors.
      def reject_inconsistent_chains
        starting_endpoints = @endpoints.values.first
        parents = endpoints_parents

        chains = starting_endpoints.map do |endpoint|
          overlaps = parents.keys & endpoint[:ancestry]
          next unless parents.empty? || !overlaps.empty?

          [endpoint] + endpoint[:ancestry].map { |id| parents[id] }.compact
        end

        chains.compact
      end

      # Endpoints of every key but the first, indexed by :id.
      #
      # Always a Hash (possibly empty), because the caller asks it for
      # +keys+. The list is reversed first so that, for a duplicated id,
      # the endpoint of the earlier key wins.
      def endpoints_parents
        @endpoints
          .values
          .drop(1)
          .flatten
          .reverse
          .each_with_object({}) { |endpoint, index| index[endpoint[:id]] = endpoint }
      end
    end
  end
end
|