fias 0.0.2 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +15 -22
- data/.rubocop.yml +7 -0
- data/.travis.yml +10 -0
- data/Gemfile +1 -1
- data/LICENSE.txt +2 -2
- data/README.md +259 -155
- data/Rakefile +6 -1
- data/config/names.txt +0 -0
- data/config/synonyms.yml +50 -0
- data/examples/create.rb +106 -0
- data/examples/generate_index.rb +63 -0
- data/fias.gemspec +33 -21
- data/lib/fias.rb +197 -10
- data/lib/fias/config.rb +74 -0
- data/lib/fias/import/copy.rb +62 -0
- data/lib/fias/import/dbf.rb +81 -0
- data/lib/fias/import/download_service.rb +37 -0
- data/lib/fias/import/restore_parent_id.rb +51 -0
- data/lib/fias/import/tables.rb +74 -0
- data/lib/fias/name/append.rb +30 -0
- data/lib/fias/name/canonical.rb +42 -0
- data/lib/fias/name/extract.rb +85 -0
- data/lib/fias/name/house_number.rb +71 -0
- data/lib/fias/name/split.rb +60 -0
- data/lib/fias/name/synonyms.rb +93 -0
- data/lib/fias/query.rb +43 -0
- data/lib/fias/query/estimate.rb +67 -0
- data/lib/fias/query/finder.rb +75 -0
- data/lib/fias/query/params.rb +101 -0
- data/lib/fias/railtie.rb +3 -17
- data/lib/fias/version.rb +1 -1
- data/spec/fixtures/ACTSTAT.DBF +0 -0
- data/spec/fixtures/NORDOC99.DBF +0 -0
- data/spec/fixtures/STRSTAT.DBF +0 -0
- data/spec/fixtures/addressing.yml +93 -0
- data/spec/fixtures/query.yml +79 -0
- data/spec/fixtures/query_sanitization.yml +75 -0
- data/spec/fixtures/status_append.yml +60 -0
- data/spec/lib/import/copy_spec.rb +44 -0
- data/spec/lib/import/dbf_spec.rb +28 -0
- data/spec/lib/import/download_service_spec.rb +15 -0
- data/spec/lib/import/restore_parent_id_spec.rb +34 -0
- data/spec/lib/import/tables_spec.rb +26 -0
- data/spec/lib/name/append_spec.rb +14 -0
- data/spec/lib/name/canonical_spec.rb +20 -0
- data/spec/lib/name/extract_spec.rb +67 -0
- data/spec/lib/name/house_number_spec.rb +45 -0
- data/spec/lib/name/query_spec.rb +21 -0
- data/spec/lib/name/split_spec.rb +15 -0
- data/spec/lib/name/synonyms_spec.rb +51 -0
- data/spec/lib/query/params_spec.rb +15 -0
- data/spec/lib/query_spec.rb +27 -0
- data/spec/spec_helper.rb +30 -0
- data/spec/support/db.rb +30 -0
- data/spec/support/query.rb +13 -0
- data/tasks/db.rake +52 -0
- data/tasks/download.rake +15 -0
- metadata +246 -64
- data/lib/fias/active_record/address_object.rb +0 -231
- data/lib/fias/active_record/address_object_type.rb +0 -15
- data/lib/fias/dbf_wrapper.rb +0 -90
- data/lib/fias/importer.rb +0 -30
- data/lib/fias/importer/base.rb +0 -59
- data/lib/fias/importer/pg.rb +0 -81
- data/lib/fias/importer/sqlite.rb +0 -38
- data/lib/generators/fias/migration.rb +0 -34
- data/lib/generators/fias/templates/create_fias_tables.rb +0 -5
- data/tasks/fias.rake +0 -68
@@ -0,0 +1,85 @@
|
|
1
|
+
module Fias
  module Name
    # Separates a toponym from the status abbreviation embedded in it.
    # Candidate abbreviations are the keys of Fias.config.index.
    module Extract
      class << self
        # Pattern for a lowercase Cyrillic letter.
        SMALL_LETTER = /[а-яё]/u

        # Score bonuses applied while ranking candidate abbreviations.
        REWARD = {
          dot: 3, small_letter: 2, head: 1
        }

        # Returns nil when +name+ is blank, [name] (whitespace-normalized)
        # when no single best abbreviation can be chosen, and otherwise the
        # toponym followed by whatever Canonical.canonical returns for the
        # matched abbreviation (flattened into one array).
        def extract(name)
          return if name.blank?

          name = cleanup(name)
          winner = pick_winner(assign_rates(name, find(name)))

          winner ? extract_name(name, winner) : [name]
        end

        private

        # Collapses whitespace runs into single spaces and trims the ends.
        def cleanup(name)
          words = name.split(' ')
          words.join(' ').strip
        end

        # Collects a MatchData for every index key that occurs in +name+
        # as a standalone word (optionally followed by a dot).
        def find(name)
          Fias.config.index.keys.each_with_object([]) do |query, found|
            pattern = /(\s|^)(#{Regexp.escape(query)})(\.|\s|$)/ui
            match = name.match(pattern)
            found << match if match && match[2]
          end
        end

        # Pairs every match with its numeric rate: [[rate, match], ...].
        def assign_rates(name, matches)
          matches.map { |candidate| rate_match(name, candidate) }
        end

        # Computes [rate, match]. Bonuses are scaled by 100 so the length of
        # the matched abbreviation only breaks ties between equal bonuses.
        def rate_match(name, match)
          short_name = match[2]

          bonus = ends_with_dot?(short_name) * REWARD[:dot]
          bonus += starts_with_small_letter?(short_name) * REWARD[:small_letter]
          bonus += border_proximity(name, match)

          [bonus * 100 + short_name.size, match]
        end

        # Larger of two position-based scores: one derived from where the
        # match begins (plus the head reward), one from where it ends.
        def border_proximity(name, match)
          from_head = name.size - match.begin(1) + REWARD[:head]
          from_tail = match.end(2)
          from_head > from_tail ? from_head : from_tail
        end

        # 1 when +value+ ends with a dot, 0 otherwise (used as a multiplier).
        def ends_with_dot?(value)
          value.end_with?('.') ? 1 : 0
        end

        # 1 when the first character of +value+ matches SMALL_LETTER,
        # 0 otherwise (used as a multiplier).
        def starts_with_small_letter?(value)
          (SMALL_LETTER =~ value[0]) ? 1 : 0
        end

        # Returns the match with the strictly highest rate, or nil when
        # there are no candidates or the top rate is shared.
        def pick_winner(rates)
          ranked = rates.sort_by(&:first).reverse
          top_rate, top_match = ranked.first
          return if ranked.drop(1).any? { |(rate, _)| rate == top_rate }
          top_match
        end

        # Removes the winning abbreviation from +name+. Falls back to
        # [name] when nothing but the abbreviation would remain.
        def extract_name(name, winner)
          toponym = cleanup(name.gsub(winner.regexp, ' '))
          return [name] if toponym.strip.blank?

          [cleanup(toponym), Canonical.canonical(winner[2])].flatten
        end
      end
    end
  end
end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
module Fias
  module Name
    # Splits an address line into the street part and the house number part.
    module HouseNumber
      class << self
        # Returns [name, nil] when +name+ does not look like it carries a
        # house number; otherwise returns [street, number], both stripped.
        # Strategies are tried in order: comma split, number with housing
        # word, house word, trailing number.
        def extract(name)
          return [name, nil] unless contains_number?(name)

          parts = try_split_by_colon(name)
          parts ||= try_housing(name)
          parts ||= try_house_word(name)
          parts ||= try_ends_with_number(name)

          street, number = parts
          [street.strip, number.strip]
        end

        private

        # Truthy when +name+ is neither a bare number nor a line/microdistrict
        # designation and matches at least one of the house-number patterns.
        def contains_number?(name)
          return false if name =~ JUST_A_NUMBER
          return false if name =~ LINE_OR_MICRODISTRICT

          [COLON, ENDS_WITH_NUMBER, HOUSE_WORD, NUMBER_WITH_HOUSING].any? do |pattern|
            name =~ pattern
          end
        end

        # NOTE: despite the "colon" naming, COLON matches a comma; the name
        # is split at the first comma into at most two parts.
        def try_split_by_colon(name)
          return unless name =~ COLON
          name.split(/\s*,\s*/, 2)
        end

        # Street is everything before the "number housing-word number" match;
        # the number part is the match itself plus whatever follows it.
        def try_housing(name)
          found = name.match(NUMBER_WITH_HOUSING)
          return unless found
          [found.pre_match, "#{found} #{found.post_match}"]
        end

        # Splits around a house word: text before it and text after it.
        def try_house_word(name)
          found = name.match(HOUSE_WORD)
          return unless found
          [found.pre_match, found.post_match]
        end

        # Treats a trailing number as the house number.
        def try_ends_with_number(name)
          found = name.match(ENDS_WITH_NUMBER)
          return unless found
          [found.pre_match, found[1]]
        end

        # Builds a regex alternation body from +words+, longest words first,
        # each word regex-escaped.
        def or_words(words)
          by_length_desc = words.sort_by(&:length).reverse
          by_length_desc.map { |word| Regexp.escape(word) }.join('|')
        end
      end

      COLON = /\,/
      JUST_A_NUMBER = /^[\s\d]+$/
      STOPWORDS = /(микрорайон|линия|микр|мкрн|мкр|лин)/ui
      LINE_OR_MICRODISTRICT = /#{STOPWORDS}\.?[\s\w+]?\d+$/ui
      NUMBER = /\d+\/?#{Fias::LETTERS}?\d*/ui
      ENDS_WITH_NUMBER = /(#{NUMBER})$/ui
      HOUSE_WORDS = %w(ом д дом вл кв)
      HOUSE_WORD =
        /(\s|\,|\.|^)(#{or_words(HOUSE_WORDS)})(\s|\,|\.|$)/ui
      HOUSING_WORDS = %w(корпус корп к)
      NUMBER_WITH_HOUSING =
        /#{NUMBER}[\s\,\.]+(#{or_words(HOUSING_WORDS)})[\s\,\.]+#{NUMBER}/ui
    end
  end
end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
module Fias
  module Name
    # Breaks a name into a unique list of lowercase word tokens.
    module Split
      class << self
        # Sanitizes +name+, scans it with Fias.word, strips brackets, splits
        # initials and dot-terminated words apart, then drops blanks and
        # duplicates.
        def split(name)
          scanned = sanitize(name).scan(Fias.word)
          pieces = split_all_dotwords(
            split_all_initials(cleanup_brackets(scanned))
          )

          pieces.reject(&:blank?).flatten.uniq
        end

        private

        # Downcases, replaces "ё" with "е" and removes quote characters.
        def sanitize(name)
          lowered = Unicode.downcase(name)
          lowered.gsub('ё', 'е').gsub(QUOTAS, '')
        end

        # Takes the first element of every scan result and removes
        # parentheses from it.
        def cleanup_brackets(words)
          words.map { |word, _| word.gsub(BRACKETS, '') }
        end

        # Replaces each word carrying initials with its split parts.
        def split_all_initials(words)
          expanded = words.map { |word, _| split_initials(word) || word }
          expanded.compact.flatten
        end

        # Returns the captured parts of +word+ when it matches
        # Fias::INITIALS (groups 1 and 3) or Fias::SINGLE_INITIAL
        # (groups 2 and 3); nil otherwise.
        def split_initials(word)
          multiple = word.match(Fias::INITIALS)
          return multiple.values_at(1, 3) if multiple

          single = word.match(Fias::SINGLE_INITIAL)
          single.values_at(2, 3) if single
        end

        # Replaces each word containing dot-terminated parts with an array
        # of those parts; other words pass through unchanged.
        def split_all_dotwords(words)
          expanded = words.map { |word, _| split_dotwords(word) || word }
          expanded.compact
        end

        # Inserts a space after every DOTWORD occurrence and splits on
        # whitespace; returns nil when +word+ has no DOTWORD.
        def split_dotwords(word)
          return unless word =~ DOTWORD

          spaced = word.gsub(DOTWORD, '\1 ')
          spaced.split(' ').uniq.delete_if(&:blank?)
        end
      end

      DOTWORD = /(#{LETTERS}{2,}\.)/ui
      BRACKETS = /(\(|\))/
      QUOTAS = /[\"\']/
    end
  end
end
|
@@ -0,0 +1,93 @@
|
|
1
|
+
module Fias
  module Name
    # Expands every token of a name into the list of its accepted variants.
    module Synonyms
      class << self
        IN_BRACKETS = /\((.*)\)/
        # An empty variant marks the token as omittable.
        OPTIONAL = ''
        NUMERAL_SUFFIXES = %w(й я е ая ий ый ой ые ое го)
        ANNIVESARY_FORMS =
          ['\1-летия', '\1-лет', '\1 летия', '\1 лет', '\1-летие', '\1 летие']

        # Returns one array of variants per token produced by Split.split.
        def expand(name)
          Split.split(name).map do |token|
            Array.wrap(tokenize(name, token))
          end
        end

        # Flat, de-duplicated list of every variant of every token.
        def tokens(name)
          expand(name).flatten.uniq
        end

        # Every combination of token variants, each rendered as a string of
        # sorted, space-joined, non-blank tokens.
        def forms(name)
          recombine(expand(name))
        end

        private

        # Tries each expansion rule in order; the first non-nil result wins.
        # Falls back to the token itself.
        def tokenize(name, token)
          variants = synonyms(token)
          variants ||= bracketed(name, token)
          variants ||= proper_names(token)
          variants ||= initials(token)
          variants ||= annivesary(token)
          variants ||= numerals(token)
          variants || token
        end

        # Looks the token up in the configured synonyms index.
        def synonyms(token)
          Fias.config.synonyms_index[token]
        end

        # A token that appears inside the bracketed part of +name+ is optional.
        def bracketed(name, token)
          inside = name.match(IN_BRACKETS)
          return unless inside && inside[1].include?(token)
          [token, OPTIONAL]
        end

        # A configured proper name is optional.
        def proper_names(token)
          return unless Fias.config.proper_names.include?(token)
          [token, OPTIONAL]
        end

        # A token matching BOTH initials patterns is optional.
        # NOTE(review): `&&` requires both patterns to match; confirm that
        # `||` was not intended.
        def initials(token)
          both = (Fias::INITIALS =~ token) && (Fias::SINGLE_INITIAL =~ token)
          [token, OPTIONAL] if both
        end

        # Rewrites a token matching Fias::ANNIVESARIES into every form
        # listed in ANNIVESARY_FORMS.
        def annivesary(token)
          return unless token =~ Fias::ANNIVESARIES

          ANNIVESARY_FORMS.map { |form| token.gsub(Fias::ANNIVESARIES, form) }
        end

        # Expands a token that starts with digits (or matches
        # Fias::ANNIVESARIES) into its suffixed numeral forms.
        def numerals(token)
          numeric = (/^\d+/ =~ token) || (Fias::ANNIVESARIES =~ token)
          numerals_for(token) if numeric
        end

        # Keeps only the digits of +numeral+ and returns them with every
        # suffix (both glued and hyphenated), followed by the bare digits.
        def numerals_for(numeral)
          digits = numeral.gsub(/[^\d]/, '')

          suffixed = NUMERAL_SUFFIXES.flat_map do |suffix|
            ["#{digits}#{suffix}", "#{digits}-#{suffix}"]
          end

          suffixed + [digits]
        end

        # Cartesian product of all variant lists, each combination turned
        # into a single normalized string.
        def recombine(variants)
          return variants if variants.empty?

          first, *others = variants
          first.product(*others).map do |combination|
            combination.reject(&:blank?).sort.join(' ')
          end
        end
      end
    end
  end
end
|
data/lib/fias/query.rb
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
module Fias
  # Mixin implementing the query pipeline. The including class must
  # override #find; the default implementation raises NotImplementedError.
  module Query
    attr_reader :params

    # Wraps the raw +params+ into Params and prepares a Finder that calls
    # back into #find of this object.
    def initialize(params)
      @params = Params.new(params)
      @finder = Finder.new(@params, method(:find))
    end

    # Asks the finder for candidate chains and returns the top-rated ones
    # as [rate, first_chain_item] pairs.
    def perform
      estimate(@finder.assumption)
    end

    protected

    # Lookup hook to be supplied by the including class.
    def find(_tokens)
      raise NotImplementedError
    end

    # Rates every chain and keeps only those sharing the highest rate.
    def estimate(assumption)
      reject_invalid_chains(estimate_chains(assumption))
    end

    # Returns [rate, chain.first] pairs ordered from highest rate to lowest.
    def estimate_chains(assumption)
      rated = assumption.map { |chain| [rate(chain), chain.first] }
      rated.sort_by { |score, _| score }.reverse
    end

    # Expects +chains+ sorted by descending rate; removes (in place) every
    # pair whose rate differs from the first one.
    def reject_invalid_chains(chains)
      return chains if chains.empty?

      best = chains.first.first
      chains.keep_if { |score, _| score == best }
    end

    # Numeric rate of a single chain, as computed by Estimate.
    def rate(chain)
      Estimate.new(@params, chain).estimate
    end
  end
end
|
@@ -0,0 +1,67 @@
|
|
1
|
+
module Fias
  module Query
    # Computes a numeric rate for one candidate chain against the query
    # params. The rate is the sum of several independently weighted parts.
    class Estimate
      RATES = {
        subject: 10000,   # It's most important to match street if street is requested
        found_part: 1000, # Then, maximum parts number should coincide
        type: 100,        # Then, status should coincide,
        name: 5,          # Then, how close name matches are
        deep: -1          # Then, how deep is matching chain situated
      }

      # +params+ must respond to #sanitized and #synonyms; +chain+ is a list
      # of hashes read via :key, :abbr, :ancestry and :tokens.
      def initialize(params, chain)
        @params = params
        @chain = chain
      end

      # Total rate of the chain.
      def estimate
        [
          for_subject,
          for_found_parts,
          for_type,
          for_deepness,
          for_name_proximity
        ].inject(:+)
      end

      private

      # Bonus when the first sanitized param key equals the key of the
      # first chain item.
      def for_subject
        requested = @params.sanitized.keys.first
        return 0 unless requested == @chain.first[:key]
        RATES[:subject]
      end

      # Bonus per chain item.
      def for_found_parts
        @chain.size * RATES[:found_part]
      end

      # Bonus for every param whose expected statuses (all values after the
      # first) include the :abbr of the chain item found under the same key.
      def for_type
        @params.sanitized.sum do |key, (_, *statuses)|
          received = chain_by_key[key].try(:[], :abbr)
          matched = statuses.present? && statuses.include?(received)
          matched ? RATES[:type] : 0
        end
      end

      # Penalty proportional to the ancestry size of the first chain item
      # (RATES[:deep] is negative).
      def for_deepness
        @chain.first[:ancestry].size * RATES[:deep]
      end

      # Bonus per token shared between the expected synonyms and the
      # tokens of the chain item found under the same key.
      def for_name_proximity
        @params.synonyms.sum do |key, (variants, _)|
          given = chain_by_key[key].try(:[], :tokens) || []
          wanted = variants.flatten.uniq
          (given & wanted).size * RATES[:name]
        end
      end

      # Chain items indexed by their :key (memoized).
      def chain_by_key
        @chain_by_key ||= @chain.index_by { |link| link[:key] }
      end
    end
  end
end
|
@@ -0,0 +1,75 @@
|
|
1
|
+
module Fias
  module Query
    # Looks up candidate address objects ("endpoints") for every part of
    # the query and assembles them into chains consistent with each
    # endpoint's ancestry.
    #
    # Endpoints are hashes read via :id, :forms and :ancestry; this class
    # additionally writes :key into each of them.
    class Finder
      # +params+ must respond to #split and #forms (hashes keyed by query
      # part). +find+ is a callable receiving the words of one query part
      # and returning a list of endpoints.
      def initialize(params, find)
        @params = params
        @find = find
      end

      # Returns a list of chains. Each chain starts with an endpoint found
      # for the first query part, followed by those endpoints of the other
      # parts that appear in its ancestry. Returns [] when the query has no
      # parts.
      def assumption
        find_endpoints
        return [] if @endpoints.blank?
        reject_inconsistent_chains
      end

      private

      # Fills @endpoints with { key => endpoints } and tags every endpoint
      # with its key.
      def find_endpoints
        pairs = @params.split.keys.map { |key| find_endpoint(key) }
        @endpoints = Hash[pairs]
        inject_key_to_endpoints
      end

      # Returns [key, endpoints] for one query part, keeping only endpoints
      # that share at least one form with the query.
      def find_endpoint(key)
        words = @params.split[key]
        endpoints = reject_endpoints(find(words), key)
        [key, endpoints]
      end

      # Delegates the lookup to the callable given at construction time.
      def find(words)
        @find.call(words)
      end

      # Drops endpoints having no form in common with the query forms
      # registered under +key+.
      def reject_endpoints(endpoints, key)
        forms = @params.forms[key]

        endpoints.reject do |endpoint|
          (forms & endpoint[:forms]).blank?
        end
      end

      # Stores the query part key inside every endpoint (mutates them).
      def inject_key_to_endpoints
        @endpoints.each do |key, endpoints|
          endpoints.each { |endpoint| endpoint[:key] = key }
        end
      end

      # Builds one chain per starting endpoint. When other query parts
      # produced endpoints, a starting endpoint is kept only if its
      # ancestry contains at least one of them.
      def reject_inconsistent_chains
        starting_endpoints = @endpoints.values.first
        parents = endpoints_parents
        parent_ids = parents.keys

        chains = starting_endpoints.map do |endpoint|
          overlaps = parent_ids & endpoint[:ancestry]
          next unless parents.blank? || overlaps.present?

          [endpoint] + endpoint[:ancestry].map { |id| parents[id] }.compact
        end

        chains.compact
      end

      # Endpoints of every query part except the first, indexed by :id.
      # Always returns a Hash: the caller uses #keys and #[] on the result,
      # so the "no endpoints at all" guard yields {} rather than an Array.
      def endpoints_parents
        parents = @endpoints.values.slice(1..-1)
        return {} if parents.nil?

        parents
          .flatten
          .reverse
          .index_by { |endpoint| endpoint[:id] }
      end
    end
  end
end
|