fias 0.0.2 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (69) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +15 -22
  3. data/.rubocop.yml +7 -0
  4. data/.travis.yml +10 -0
  5. data/Gemfile +1 -1
  6. data/LICENSE.txt +2 -2
  7. data/README.md +259 -155
  8. data/Rakefile +6 -1
  9. data/config/names.txt +0 -0
  10. data/config/synonyms.yml +50 -0
  11. data/examples/create.rb +106 -0
  12. data/examples/generate_index.rb +63 -0
  13. data/fias.gemspec +33 -21
  14. data/lib/fias.rb +197 -10
  15. data/lib/fias/config.rb +74 -0
  16. data/lib/fias/import/copy.rb +62 -0
  17. data/lib/fias/import/dbf.rb +81 -0
  18. data/lib/fias/import/download_service.rb +37 -0
  19. data/lib/fias/import/restore_parent_id.rb +51 -0
  20. data/lib/fias/import/tables.rb +74 -0
  21. data/lib/fias/name/append.rb +30 -0
  22. data/lib/fias/name/canonical.rb +42 -0
  23. data/lib/fias/name/extract.rb +85 -0
  24. data/lib/fias/name/house_number.rb +71 -0
  25. data/lib/fias/name/split.rb +60 -0
  26. data/lib/fias/name/synonyms.rb +93 -0
  27. data/lib/fias/query.rb +43 -0
  28. data/lib/fias/query/estimate.rb +67 -0
  29. data/lib/fias/query/finder.rb +75 -0
  30. data/lib/fias/query/params.rb +101 -0
  31. data/lib/fias/railtie.rb +3 -17
  32. data/lib/fias/version.rb +1 -1
  33. data/spec/fixtures/ACTSTAT.DBF +0 -0
  34. data/spec/fixtures/NORDOC99.DBF +0 -0
  35. data/spec/fixtures/STRSTAT.DBF +0 -0
  36. data/spec/fixtures/addressing.yml +93 -0
  37. data/spec/fixtures/query.yml +79 -0
  38. data/spec/fixtures/query_sanitization.yml +75 -0
  39. data/spec/fixtures/status_append.yml +60 -0
  40. data/spec/lib/import/copy_spec.rb +44 -0
  41. data/spec/lib/import/dbf_spec.rb +28 -0
  42. data/spec/lib/import/download_service_spec.rb +15 -0
  43. data/spec/lib/import/restore_parent_id_spec.rb +34 -0
  44. data/spec/lib/import/tables_spec.rb +26 -0
  45. data/spec/lib/name/append_spec.rb +14 -0
  46. data/spec/lib/name/canonical_spec.rb +20 -0
  47. data/spec/lib/name/extract_spec.rb +67 -0
  48. data/spec/lib/name/house_number_spec.rb +45 -0
  49. data/spec/lib/name/query_spec.rb +21 -0
  50. data/spec/lib/name/split_spec.rb +15 -0
  51. data/spec/lib/name/synonyms_spec.rb +51 -0
  52. data/spec/lib/query/params_spec.rb +15 -0
  53. data/spec/lib/query_spec.rb +27 -0
  54. data/spec/spec_helper.rb +30 -0
  55. data/spec/support/db.rb +30 -0
  56. data/spec/support/query.rb +13 -0
  57. data/tasks/db.rake +52 -0
  58. data/tasks/download.rake +15 -0
  59. metadata +246 -64
  60. data/lib/fias/active_record/address_object.rb +0 -231
  61. data/lib/fias/active_record/address_object_type.rb +0 -15
  62. data/lib/fias/dbf_wrapper.rb +0 -90
  63. data/lib/fias/importer.rb +0 -30
  64. data/lib/fias/importer/base.rb +0 -59
  65. data/lib/fias/importer/pg.rb +0 -81
  66. data/lib/fias/importer/sqlite.rb +0 -38
  67. data/lib/generators/fias/migration.rb +0 -34
  68. data/lib/generators/fias/templates/create_fias_tables.rb +0 -5
  69. data/tasks/fias.rake +0 -68
@@ -0,0 +1,85 @@
1
module Fias
  module Name
    # Separates a status (short type name, e.g. a street abbreviation) from
    # the toponym inside a raw address-object name.
    #
    # Candidate statuses are the keys of +Fias.config.index+. Every candidate
    # found in the name is rated, and the single best-rated one is cut out.
    module Extract
      # A lowercase Cyrillic letter (used to reward lowercase statuses).
      SMALL_LETTER = /[а-яё]/u

      # Bonus points used when rating a candidate status.
      REWARD = {
        dot: 3, small_letter: 2, head: 1
      }.freeze

      class << self
        # Extracts the status from +name+.
        #
        # Returns +nil+ for a blank name, +[name]+ (whitespace-normalized)
        # when no unambiguous status is found or when nothing but the status
        # would remain, and otherwise the toponym followed by whatever
        # +Canonical.canonical+ returns for the matched status, flattened
        # into a single array.
        def extract(name)
          return if name.blank?
          name = cleanup(name)

          winner = pick_winner(assign_rates(name, find(name)))
          return [name] unless winner

          extract_name(name, winner)
        end

        private

        # Collapses runs of whitespace into single spaces and trims the ends.
        def cleanup(name)
          name.split(' ').join(' ')
        end

        # Returns a MatchData for every index key that occurs in +name+ as a
        # separate word (optionally followed by a dot).
        def find(name)
          candidates = Fias.config.index.keys.map do |query|
            name.match(/(\s|^)(#{Regexp.escape(query)})(\.|\s|$)/ui)
          end
          candidates.compact
        end

        # Pairs every match with its rate: [[rate, match], ...].
        def assign_rates(name, matches)
          matches.map { |match| rate_match(name, match) }
        end

        # Rates a single match. The bonus sum is scaled by 100 so the length
        # of the matched status only acts as a tie-breaker between candidates
        # with equal bonuses (as long as statuses stay under 100 characters).
        def rate_match(name, match)
          short_name = match[2]

          bonus =
            dot_bonus(short_name) +
            small_letter_bonus(short_name) +
            border_proximity(name, match)

          [bonus * 100 + short_name.size, match]
        end

        # Larger when the match sits near the beginning or the end of the
        # name; matches at the beginning get an extra REWARD[:head].
        def border_proximity(name, match)
          head = name.size - match.begin(1) + REWARD[:head]
          tail = match.end(2)
          [head, tail].max
        end

        # REWARD[:dot] when the status itself ends with a dot, 0 otherwise.
        def dot_bonus(value)
          value[-1] == '.' ? REWARD[:dot] : 0
        end

        # REWARD[:small_letter] when the status starts with a lowercase
        # Cyrillic letter, 0 otherwise.
        def small_letter_bonus(value)
          value[0] =~ SMALL_LETTER ? REWARD[:small_letter] : 0
        end

        # Returns the match holding the strictly highest rate, or nil when
        # there are no candidates or the top rate is shared (ambiguous).
        def pick_winner(rates)
          return if rates.empty?

          best = rates.map(&:first).max
          leaders = rates.select { |(rate, _)| rate == best }
          leaders.first.last if leaders.size == 1
        end

        # Cuts every occurrence of the winning pattern out of +name+.
        def extract_name(name, winner)
          toponym = cleanup(name.gsub(winner.regexp, ' '))
          return [name] if toponym.blank?

          [toponym, Canonical.canonical(winner[2])].flatten
        end
      end
    end
  end
end
@@ -0,0 +1,71 @@
1
module Fias
  module Name
    # Splits a street name that carries a house number into the street part
    # and the number part.
    module HouseNumber
      class << self
        # Returns [street, number]. When no house number is recognized the
        # name is returned untouched together with nil.
        def extract(name)
          return [name, nil] unless contains_number?(name)

          # The strategies are tried in order; the first one that applies wins.
          parts = try_split_by_colon(name)
          parts ||= try_housing(name)
          parts ||= try_house_word(name)
          parts ||= try_ends_with_number(name)

          street, number = parts
          [street.strip, number.strip]
        end

        private

        # Truthy when +name+ looks like it contains a house number: it is not
        # a bare number, does not end like a numbered line / microdistrict,
        # and matches at least one of the house-number patterns.
        def contains_number?(name)
          return false if name =~ JUST_A_NUMBER || name =~ LINE_OR_MICRODISTRICT

          name =~ COLON ||
            name =~ ENDS_WITH_NUMBER ||
            name =~ HOUSE_WORD ||
            name =~ NUMBER_WITH_HOUSING
        end

        # "street, number" -> split on the first comma.
        # NOTE(review): the constant is called COLON but matches a comma.
        def try_split_by_colon(name)
          return unless name =~ COLON
          name.split(/\s*,\s*/, 2)
        end

        # Number followed by a housing word and another number: everything
        # from the first number onwards is treated as the house number.
        def try_housing(name)
          found = name.match(NUMBER_WITH_HOUSING)
          return unless found
          [found.pre_match, "#{found} #{found.post_match}"]
        end

        # Splits around a standalone house word.
        def try_house_word(name)
          found = name.match(HOUSE_WORD)
          return unless found
          [found.pre_match, found.post_match]
        end

        # Trailing number: the number is the last capture, the rest is street.
        def try_ends_with_number(name)
          found = name.match(ENDS_WITH_NUMBER)
          return unless found
          [found.pre_match, found[1]]
        end

        # Builds a regexp alternation body out of +words+, longest words
        # first so that longer words are tried before their shorter prefixes.
        def or_words(words)
          longest_first = words.sort_by(&:length).reverse
          escaped = longest_first.map { |word| Regexp.escape(word) }
          escaped.join('|')
        end
      end

      COLON                 = /\,/
      JUST_A_NUMBER         = /^[\s\d]+$/
      STOPWORDS             = /(микрорайон|линия|микр|мкрн|мкр|лин)/ui
      LINE_OR_MICRODISTRICT = /#{STOPWORDS}\.?[\s\w+]?\d+$/ui
      NUMBER                = /\d+\/?#{Fias::LETTERS}?\d*/ui
      ENDS_WITH_NUMBER      = /(#{NUMBER})$/ui
      HOUSE_WORDS           = %w(ом д дом вл кв)
      HOUSE_WORD            =
        /(\s|\,|\.|^)(#{or_words(HOUSE_WORDS)})(\s|\,|\.|$)/ui
      HOUSING_WORDS         = %w(корпус корп к)
      NUMBER_WITH_HOUSING   =
        /#{NUMBER}[\s\,\.]+(#{or_words(HOUSING_WORDS)})[\s\,\.]+#{NUMBER}/ui
    end
  end
end
@@ -0,0 +1,60 @@
1
module Fias
  module Name
    # Breaks a name into a list of unique, normalized words.
    module Split
      class << self
        # Returns the unique non-blank words of +name+ after lowercasing,
        # quote/bracket removal and splitting of initials and dotted words.
        def split(name)
          tokens = sanitize(name).scan(Fias.word)
          tokens = cleanup_brackets(tokens)
          tokens = split_all_initials(tokens)
          tokens = split_all_dotwords(tokens)

          tokens.reject(&:blank?).flatten.uniq
        end

        private

        # Lowercases the name, folds "ё" into "е" and drops quote characters.
        def sanitize(name)
          lowered = Unicode.downcase(name)
          lowered.gsub('ё', 'е').gsub(QUOTAS, '')
        end

        # Removes round brackets from every word. When an item is an array
        # (scan result with groups) only its first element is used.
        def cleanup_brackets(words)
          words.map do |word, _|
            word.gsub(BRACKETS, '')
          end
        end

        # Replaces words that contain initials with their parts.
        def split_all_initials(words)
          pieces = words.map { |word, _| split_initials(word) || word }
          pieces.compact.flatten
        end

        # Returns the captured parts when +word+ matches one of the initials
        # patterns, nil otherwise. The multi-initial pattern is tried first.
        def split_initials(word)
          if (several = word.match(Fias::INITIALS))
            several.values_at(1, 3)
          elsif (single = word.match(Fias::SINGLE_INITIAL))
            single.values_at(2, 3)
          end
        end

        # Replaces words that contain dotted parts with arrays of those
        # parts; nil items are dropped.
        def split_all_dotwords(words)
          pieces = words.map { |word, _| split_dotwords(word) || word }
          pieces.compact
        end

        # Puts a space after every dotted part and splits on whitespace.
        # Returns nil when +word+ has no dotted part.
        def split_dotwords(word)
          return unless word =~ DOTWORD

          spaced = word.gsub(DOTWORD, '\1 ')
          spaced.split(' ').uniq.delete_if(&:blank?)
        end
      end

      DOTWORD  = /(#{LETTERS}{2,}\.)/ui
      BRACKETS = /(\(|\))/
      QUOTAS   = /[\"\']/
    end
  end
end
@@ -0,0 +1,93 @@
1
module Fias
  module Name
    # Expands every word of a name into the list of its acceptable variants.
    module Synonyms
      class << self
        # One array of variants per word of +name+ (words come from
        # Split.split).
        def expand(name)
          Split.split(name).map do |token|
            Array.wrap(tokenize(name, token))
          end
        end

        # Flat, de-duplicated list of all variants of all words.
        def tokens(name)
          expand(name).flatten.uniq
        end

        # Every combination of per-word variants, joined into strings.
        def forms(name)
          recombine(expand(name))
        end

        private

        # Picks the variants for a single token. Strategies are tried in
        # order and the first non-nil result wins; the token itself is the
        # fallback.
        def tokenize(name, token)
          variants = synonyms(token)
          variants ||= bracketed(name, token)
          variants ||= proper_names(token)
          variants ||= initials(token)
          variants ||= anniversary(token)
          variants ||= numerals(token)
          variants || token
        end

        # Looks the token up in the configured synonyms index.
        def synonyms(token)
          Fias.config.synonyms_index[token]
        end

        # A token found inside round brackets in the name is optional.
        def bracketed(name, token)
          inner = name[IN_BRACKETS, 1]
          [token, OPTIONAL] if inner && inner.include?(token)
        end

        # Configured proper names are optional.
        def proper_names(token)
          [token, OPTIONAL] if Fias.config.proper_names.include?(token)
        end

        # Initials are optional.
        # NOTE(review): the token must match BOTH initials patterns here;
        # confirm that `&&` (rather than `||`) is intended.
        def initials(token)
          [token, OPTIONAL] if
            (Fias::INITIALS =~ token) && (Fias::SINGLE_INITIAL =~ token)
        end

        # Rewrites an anniversary token into every form of ANNIVESARY_FORMS.
        def anniversary(token)
          return unless token =~ Fias::ANNIVESARIES

          ANNIVESARY_FORMS.map { |form| token.gsub(Fias::ANNIVESARIES, form) }
        end

        # Numeral variants for tokens that start with digits.
        def numerals(token)
          numeric = (/^\d+/ =~ token) || (Fias::ANNIVESARIES =~ token)
          numerals_for(token) if numeric
        end

        # The bare digits of +numeral+ with every suffix attached, both
        # directly and through a hyphen, followed by the bare digits.
        def numerals_for(numeral)
          digits = numeral.gsub(/[^\d]/, '')

          suffixed = NUMERAL_SUFFIXES.flat_map do |suffix|
            ["#{digits}#{suffix}", "#{digits}-#{suffix}"]
          end

          suffixed + [digits]
        end

        # Cartesian product of the per-word variants. Blank (optional)
        # variants are dropped and the remaining words are sorted before
        # joining, so word order does not matter.
        def recombine(variants)
          return variants if variants.empty?

          first, *others = variants
          first.product(*others).map do |combination|
            combination.reject(&:blank?).sort.join(' ')
          end
        end

        IN_BRACKETS = /\((.*)\)/
        OPTIONAL = ''
        NUMERAL_SUFFIXES = %w(й я е ая ий ый ой ые ое го)
        ANNIVESARY_FORMS =
          ['\1-летия', '\1-лет', '\1 летия', '\1 лет', '\1-летие', '\1 летие']
      end
    end
  end
end
@@ -0,0 +1,43 @@
1
module Fias
  # Mixin implementing the address lookup flow. The including class must
  # override #find; the default implementation raises NotImplementedError.
  module Query
    attr_reader :params

    # Wraps the raw +params+ into Params and prepares a Finder that calls
    # back into #find.
    def initialize(params)
      @params = Params.new(params)
      @finder = Finder.new(@params, method(:find))
    end

    # Runs the search and returns the best-rated results as
    # [rate, first item of the chain] pairs.
    def perform
      estimate(@finder.assumption)
    end

    protected

    # Storage lookup hook; to be provided by the including class.
    def find(_tokens)
      raise NotImplementedError
    end

    # Rates the chains and keeps only the top-rated ones.
    def estimate(assumption)
      reject_invalid_chains(estimate_chains(assumption))
    end

    # Pairs the rate of every chain with the chain's first item, ordered
    # from the highest rate to the lowest.
    def estimate_chains(assumption)
      rated = assumption.map { |chain| [rate(chain), chain.first] }
      rated.sort_by(&:first).reverse
    end

    # Keeps (in place) only the entries sharing the highest rate. Expects
    # +chains+ to be ordered by rate, highest first.
    def reject_invalid_chains(chains)
      return chains if chains.empty?

      top_rate = chains.first.first
      chains.keep_if { |(chain_rate, _)| chain_rate == top_rate }
    end

    # Rate of a single chain, as computed by Estimate.
    def rate(chain)
      Estimate.new(@params, chain).estimate
    end
  end
end
@@ -0,0 +1,67 @@
1
module Fias
  module Query
    # Computes a numeric rate describing how well a chain of found items
    # fits the query params. Higher is better.
    class Estimate
      def initialize(params, chain)
        @params = params
        @chain = chain
      end

      # Sum of all partial rates.
      def estimate
        [
          for_subject,
          for_found_parts,
          for_type,
          for_deepness,
          for_name_proximity
        ].inject(:+)
      end

      private

      # Reward when the first chain item has the key listed first in the
      # sanitized params.
      def for_subject
        requested = @params.sanitized.keys.first
        return 0 unless requested == @chain.first[:key]

        RATES[:subject]
      end

      # Reward per chain item.
      def for_found_parts
        RATES[:found_part] * @chain.size
      end

      # Reward for every param whose expected statuses include the :abbr of
      # the chain item found under the same key.
      def for_type
        @params.sanitized.sum do |key, (_, *expected_status)|
          received_status = chain_by_key[key].try(:[], :abbr)

          if expected_status.present? &&
             expected_status.include?(received_status)
            RATES[:type]
          else
            0
          end
        end
      end

      # Penalty growing with the ancestry size of the first chain item.
      def for_deepness
        RATES[:deep] * @chain.first[:ancestry].size
      end

      # Reward per token shared by the expected synonyms and the tokens of
      # the chain item found under the same key.
      def for_name_proximity
        @params.synonyms.sum do |key, (expected, _)|
          given = chain_by_key[key].try(:[], :tokens) || []
          shared = given & expected.flatten.uniq

          shared.size * RATES[:name]
        end
      end

      # Chain items indexed by their :key (memoized).
      def chain_by_key
        @chain_by_key ||= @chain.index_by { |item| item[:key] }
      end

      RATES = {
        subject: 10000,   # Most important: match a street if a street is requested
        found_part: 1000, # Then, the number of found parts should be maximal
        type: 100,        # Then, the status should coincide
        name: 5,          # Then, how close the name matches are
        deep: -1          # Then, how deep the matching chain is situated
      }
    end
  end
end
@@ -0,0 +1,75 @@
1
module Fias
  module Query
    # Looks up candidate items ("endpoints") for every part of the query and
    # assembles them into chains whose ancestry is consistent.
    class Finder
      # params - object responding to #split and #forms; both are indexed by
      #          the query part key (presumably a Fias::Query::Params).
      # find   - callable receiving the words of one query part and returning
      #          an array of endpoint hashes. This class reads their :id,
      #          :forms and :ancestry entries and writes :key.
      def initialize(params, find)
        @params = params
        @find = find
      end

      # Returns an array of chains. Every chain starts with an endpoint found
      # for the first query part and continues with its ancestors found for
      # the remaining parts. Returns [] when the query has no parts.
      def assumption
        find_endpoints
        return [] if @endpoints.blank?
        reject_inconsistent_chains
      end

      private

      # Fills @endpoints with { key => [endpoint, ...] }, preserving the
      # order of the query parts, and tags every endpoint with its key.
      def find_endpoints
        pairs = @params.split.keys.map { |key| find_endpoint(key) }
        @endpoints = Hash[pairs]
        inject_key_to_endpoints
      end

      # Returns [key, endpoints] for a single query part.
      def find_endpoint(key)
        words = @params.split[key]
        [key, reject_endpoints(find(words), key)]
      end

      def find(words)
        @find.call(words)
      end

      # Drops endpoints sharing no form with the forms expected for +key+.
      def reject_endpoints(endpoints, key)
        forms = @params.forms[key]

        endpoints.reject do |endpoint|
          (forms & endpoint[:forms]).blank?
        end
      end

      # Stores the query part key on every endpoint (mutates the hashes
      # returned by the find callable).
      def inject_key_to_endpoints
        @endpoints.each do |key, endpoints|
          endpoints.each { |endpoint| endpoint[:key] = key }
        end
      end

      # Builds a chain for every endpoint of the first query part. When other
      # parts produced endpoints, a starting endpoint is kept only if at
      # least one of them is among its ancestors.
      def reject_inconsistent_chains
        parents = endpoints_parents
        parent_ids = parents.keys # hoisted: identical for every endpoint

        chains = @endpoints.values.first.map do |endpoint|
          overlaps = parent_ids & endpoint[:ancestry]
          next unless parents.blank? || overlaps.present?

          [endpoint] + endpoint[:ancestry].map { |id| parents[id] }.compact
        end

        chains.compact
      end

      # Endpoints of all query parts except the first one, indexed by :id.
      # Always returns a Hash (empty when there is nothing to index), so the
      # caller can rely on Hash methods. The list is reversed before indexing
      # so that, for duplicate ids, the endpoint of the earlier part wins.
      def endpoints_parents
        @endpoints
          .values
          .drop(1)
          .flatten
          .reverse
          .index_by { |endpoint| endpoint[:id] }
      end
    end
  end
end