fias 0.0.2 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +15 -22
  3. data/.rubocop.yml +7 -0
  4. data/.travis.yml +10 -0
  5. data/Gemfile +1 -1
  6. data/LICENSE.txt +2 -2
  7. data/README.md +259 -155
  8. data/Rakefile +6 -1
  9. data/config/names.txt +0 -0
  10. data/config/synonyms.yml +50 -0
  11. data/examples/create.rb +106 -0
  12. data/examples/generate_index.rb +63 -0
  13. data/fias.gemspec +33 -21
  14. data/lib/fias.rb +197 -10
  15. data/lib/fias/config.rb +74 -0
  16. data/lib/fias/import/copy.rb +62 -0
  17. data/lib/fias/import/dbf.rb +81 -0
  18. data/lib/fias/import/download_service.rb +37 -0
  19. data/lib/fias/import/restore_parent_id.rb +51 -0
  20. data/lib/fias/import/tables.rb +74 -0
  21. data/lib/fias/name/append.rb +30 -0
  22. data/lib/fias/name/canonical.rb +42 -0
  23. data/lib/fias/name/extract.rb +85 -0
  24. data/lib/fias/name/house_number.rb +71 -0
  25. data/lib/fias/name/split.rb +60 -0
  26. data/lib/fias/name/synonyms.rb +93 -0
  27. data/lib/fias/query.rb +43 -0
  28. data/lib/fias/query/estimate.rb +67 -0
  29. data/lib/fias/query/finder.rb +75 -0
  30. data/lib/fias/query/params.rb +101 -0
  31. data/lib/fias/railtie.rb +3 -17
  32. data/lib/fias/version.rb +1 -1
  33. data/spec/fixtures/ACTSTAT.DBF +0 -0
  34. data/spec/fixtures/NORDOC99.DBF +0 -0
  35. data/spec/fixtures/STRSTAT.DBF +0 -0
  36. data/spec/fixtures/addressing.yml +93 -0
  37. data/spec/fixtures/query.yml +79 -0
  38. data/spec/fixtures/query_sanitization.yml +75 -0
  39. data/spec/fixtures/status_append.yml +60 -0
  40. data/spec/lib/import/copy_spec.rb +44 -0
  41. data/spec/lib/import/dbf_spec.rb +28 -0
  42. data/spec/lib/import/download_service_spec.rb +15 -0
  43. data/spec/lib/import/restore_parent_id_spec.rb +34 -0
  44. data/spec/lib/import/tables_spec.rb +26 -0
  45. data/spec/lib/name/append_spec.rb +14 -0
  46. data/spec/lib/name/canonical_spec.rb +20 -0
  47. data/spec/lib/name/extract_spec.rb +67 -0
  48. data/spec/lib/name/house_number_spec.rb +45 -0
  49. data/spec/lib/name/query_spec.rb +21 -0
  50. data/spec/lib/name/split_spec.rb +15 -0
  51. data/spec/lib/name/synonyms_spec.rb +51 -0
  52. data/spec/lib/query/params_spec.rb +15 -0
  53. data/spec/lib/query_spec.rb +27 -0
  54. data/spec/spec_helper.rb +30 -0
  55. data/spec/support/db.rb +30 -0
  56. data/spec/support/query.rb +13 -0
  57. data/tasks/db.rake +52 -0
  58. data/tasks/download.rake +15 -0
  59. metadata +246 -64
  60. data/lib/fias/active_record/address_object.rb +0 -231
  61. data/lib/fias/active_record/address_object_type.rb +0 -15
  62. data/lib/fias/dbf_wrapper.rb +0 -90
  63. data/lib/fias/importer.rb +0 -30
  64. data/lib/fias/importer/base.rb +0 -59
  65. data/lib/fias/importer/pg.rb +0 -81
  66. data/lib/fias/importer/sqlite.rb +0 -38
  67. data/lib/generators/fias/migration.rb +0 -34
  68. data/lib/generators/fias/templates/create_fias_tables.rb +0 -5
  69. data/tasks/fias.rake +0 -68
@@ -0,0 +1,85 @@
1
module Fias
  module Name
    # Separates a toponym from the status word it is written with.
    # Candidate status words are the keys of Fias.config.index.
    module Extract
      class << self
        # Returns nil for a blank name, [name] (whitespace-normalized) when
        # no single best status word can be chosen, and otherwise the
        # toponym followed by whatever Canonical.canonical yields for the
        # winning status word.
        def extract(name)
          return if name.blank?

          name = cleanup(name)
          winner = pick_winner(assign_rates(name, find(name)))
          winner ? extract_name(name, winner) : [name]
        end

        private

        # Collapses whitespace runs into single spaces and trims both ends.
        def cleanup(name)
          words = name.split(' ')
          words.join(' ').strip
        end

        # Collects a MatchData for every index key that occurs in +name+
        # as a separate word (case-insensitive). Group 1 is the leading
        # border, group 2 the key itself, group 3 the trailing border
        # (dot, whitespace or end of line).
        def find(name)
          Fias.config.index.keys.each_with_object([]) do |query, found|
            match = name.match(/(\s|^)(#{Regexp.escape(query)})(\.|\s|$)/ui)
            found << match if match && match[2]
          end
        end

        # Pairs every match with its numeric rate: [[rate, match], ...].
        def assign_rates(name, matches)
          matches.map { |match| rate_match(name, match) }
        end

        # Rate = (dot bonus + small-letter bonus + border proximity) * 100
        # plus the length of the matched word, so that a longer word wins
        # when everything else is equal.
        def rate_match(name, match)
          short_name = match[2]

          dot_bonus = ends_with_dot?(short_name) * REWARD[:dot]
          letter_bonus =
            starts_with_small_letter?(short_name) * REWARD[:small_letter]
          base = dot_bonus + letter_bonus + border_proximity(name, match)

          [base * 100 + short_name.size, match]
        end

        # The larger of two numbers: the distance from the match start to
        # the end of the name (plus the :head reward), and the offset at
        # which the matched word ends.
        def border_proximity(name, match)
          from_start = name.size - match.begin(1) + REWARD[:head]
          from_end = match.end(2)
          from_start > from_end ? from_start : from_end
        end

        # 1 when the value ends with a dot, 0 otherwise (used as a factor).
        def ends_with_dot?(value)
          value.end_with?('.') ? 1 : 0
        end

        # 1 when the first character matches SMALL_LETTER, 0 otherwise.
        def starts_with_small_letter?(value)
          return 0 unless value[0] =~ SMALL_LETTER
          1
        end

        # Returns the match with the highest rate, or nil when there are
        # no matches or the highest rate is shared by several matches.
        def pick_winner(rates)
          ordered = rates.sort_by(&:first).reverse
          top_rate, top_match = ordered.first
          rivals = ordered.drop(1)
          return if rivals.any? { |(other_rate, _)| top_rate == other_rate }
          top_match
        end

        # Removes every occurrence of the winner's pattern from the name.
        # If nothing is left, the name itself is returned untouched.
        def extract_name(name, winner)
          toponym = cleanup(name.gsub(winner.regexp, ' '))
          return [name] if toponym.strip.blank?

          [cleanup(toponym), Canonical.canonical(winner[2])].flatten
        end

        SMALL_LETTER = /[а-яё]/u

        REWARD = {
          dot: 3, small_letter: 2, head: 1
        }
      end
    end
  end
end
@@ -0,0 +1,71 @@
1
module Fias
  module Name
    # Splits a string into the part before a house number and the house
    # number itself.
    module HouseNumber
      class << self
        # Returns [name, nil] when no house number is recognized,
        # otherwise [text_before_number, number], both stripped.
        # Strategies are tried in order: comma split, number with housing
        # word, house word, trailing number.
        def extract(name)
          return [name, nil] unless contains_number?(name)

          parts = try_split_by_colon(name)
          parts ||= try_housing(name)
          parts ||= try_house_word(name)
          parts ||= try_ends_with_number(name)

          street, number = parts
          [street.strip, number.strip]
        end

        private

        # A name qualifies when it is not digits/whitespace only, does not
        # match LINE_OR_MICRODISTRICT, and matches at least one of the
        # patterns the try_* strategies rely on.
        def contains_number?(name)
          return false if name =~ JUST_A_NUMBER
          return false if name =~ LINE_OR_MICRODISTRICT

          [COLON, ENDS_WITH_NUMBER, HOUSE_WORD, NUMBER_WITH_HOUSING]
            .any? { |pattern| name =~ pattern }
        end

        # Splits on the first comma (COLON matches a comma despite its
        # name), swallowing whitespace around it.
        def try_split_by_colon(name)
          return unless name =~ COLON
          name.split(/\s*,\s*/, 2)
        end

        # Number + housing word + number: everything from the first number
        # onwards becomes the house number.
        def try_housing(name)
          match = name.match(NUMBER_WITH_HOUSING)
          return unless match
          [match.pre_match, "#{match} #{match.post_match}"]
        end

        # Splits around a standalone house word; the word itself is dropped.
        def try_house_word(name)
          match = name.match(HOUSE_WORD)
          return unless match
          [match.pre_match, match.post_match]
        end

        # Takes a number at the very end of the string as the house number.
        def try_ends_with_number(name)
          match = name.match(ENDS_WITH_NUMBER)
          return unless match
          [match.pre_match, match[1]]
        end

        # Builds a regexp alternation source from the words, longest first.
        def or_words(words)
          longest_first = words.sort_by(&:length).reverse
          longest_first.map { |word| Regexp.escape(word) }.join('|')
        end
      end

      COLON = /\,/
      JUST_A_NUMBER = /^[\s\d]+$/
      STOPWORDS = /(микрорайон|линия|микр|мкрн|мкр|лин)/ui
      LINE_OR_MICRODISTRICT = /#{STOPWORDS}\.?[\s\w+]?\d+$/ui
      NUMBER = /\d+\/?#{Fias::LETTERS}?\d*/ui
      ENDS_WITH_NUMBER = /(#{NUMBER})$/ui
      HOUSE_WORDS = %w(ом д дом вл кв)
      HOUSE_WORD =
        /(\s|\,|\.|^)(#{or_words(HOUSE_WORDS)})(\s|\,|\.|$)/ui
      HOUSING_WORDS = %w(корпус корп к)
      NUMBER_WITH_HOUSING =
        /#{NUMBER}[\s\,\.]+(#{or_words(HOUSING_WORDS)})[\s\,\.]+#{NUMBER}/ui
    end
  end
end
@@ -0,0 +1,60 @@
1
module Fias
  module Name
    # Breaks a name into a flat list of unique, normalized words.
    module Split
      class << self
        # Pipeline: sanitize -> scan with Fias.word -> drop brackets ->
        # split initials -> split dot-terminated words. Blank entries are
        # rejected, nested results flattened and duplicates removed.
        def split(name)
          scanned = sanitize(name).scan(Fias.word)
          unbracketed = cleanup_brackets(scanned)
          with_initials = split_all_initials(unbracketed)
          tokens = split_all_dotwords(with_initials)

          tokens.reject(&:blank?).flatten.uniq
        end

        private

        # Downcases, replaces "ё" with "е" and removes quote characters.
        def sanitize(name)
          lowered = Unicode.downcase(name)
          lowered.gsub('ё', 'е').gsub(QUOTAS, '')
        end

        # Removes round brackets from each word. When scan yields arrays
        # of captures, only the first capture is used.
        def cleanup_brackets(words)
          words.map do |word, _|
            word.gsub(BRACKETS, '')
          end
        end

        # Replaces every word that contains initials with its parts.
        def split_all_initials(words)
          expanded = words.map { |word, _| split_initials(word) || word }
          expanded.compact.flatten
        end

        # Captures 1 and 3 of Fias::INITIALS, or captures 2 and 3 of
        # Fias::SINGLE_INITIAL; nil when neither pattern matches.
        def split_initials(word)
          double = word.match(Fias::INITIALS)
          return double.values_at(1, 3) if double

          single = word.match(Fias::SINGLE_INITIAL)
          single.values_at(2, 3) if single
        end

        # Replaces every word that contains dot-terminated parts with the
        # list of those parts; nil entries are dropped.
        def split_all_dotwords(words)
          expanded = words.map { |word, _| split_dotwords(word) || word }
          expanded.compact
        end

        # Inserts a space after each DOTWORD occurrence and splits on
        # whitespace. Returns nil when the word has no DOTWORD at all.
        def split_dotwords(word)
          return unless word =~ DOTWORD

          spaced = word.gsub(DOTWORD, '\1 ')
          spaced.split(' ').uniq.reject(&:blank?)
        end
      end

      DOTWORD = /(#{LETTERS}{2,}\.)/ui
      BRACKETS = /(\(|\))/
      QUOTAS = /[\"\']/
    end
  end
end
@@ -0,0 +1,93 @@
1
module Fias
  module Name
    # Expands the words of a name into their alternative spellings.
    module Synonyms
      class << self
        # One array of variants per word produced by Split.split.
        def expand(name)
          Split.split(name).map do |token|
            Array.wrap(tokenize(name, token))
          end
        end

        # All variants of all words as a flat list without duplicates.
        def tokens(name)
          expand(name).flatten.uniq
        end

        # Every combination of one variant per word, as strings.
        def forms(name)
          recombine(expand(name))
        end

        private

        # The first strategy that yields a result wins; a token nothing
        # applies to is returned as is.
        def tokenize(name, token)
          variants = synonyms(token)
          variants ||= bracketed(name, token)
          variants ||= proper_names(token)
          variants ||= initials(token)
          variants ||= annivesary(token)
          variants ||= numerals(token)
          variants || token
        end

        # Looks the token up in Fias.config.synonyms_index.
        def synonyms(token)
          Fias.config.synonyms_index[token]
        end

        # A token found inside round brackets in the name is optional.
        def bracketed(name, token)
          match = name.match(IN_BRACKETS)
          return unless match && match[1].include?(token)

          [token, OPTIONAL]
        end

        # A token listed in Fias.config.proper_names is optional.
        def proper_names(token)
          return unless Fias.config.proper_names.include?(token)

          [token, OPTIONAL]
        end

        # A token matching both Fias::INITIALS and Fias::SINGLE_INITIAL
        # is optional.
        def initials(token)
          matched =
            (Fias::INITIALS =~ token) && (Fias::SINGLE_INITIAL =~ token)

          [token, OPTIONAL] if matched
        end

        # Rewrites a token matching Fias::ANNIVESARIES into every
        # spelling listed in ANNIVESARY_FORMS.
        def annivesary(token)
          return unless token =~ Fias::ANNIVESARIES

          ANNIVESARY_FORMS.map { |form| token.gsub(Fias::ANNIVESARIES, form) }
        end

        # Applies to tokens that start with digits or match
        # Fias::ANNIVESARIES.
        def numerals(token)
          starts_with_digits = /^\d+/ =~ token
          return unless starts_with_digits || (Fias::ANNIVESARIES =~ token)

          numerals_for(token)
        end

        # For the digits of the token: "<n><suffix>" and "<n>-<suffix>"
        # for each suffix in NUMERAL_SUFFIXES, followed by the bare digits.
        def numerals_for(numeral)
          digits = numeral.gsub(/[^\d]/, '')

          suffixed = NUMERAL_SUFFIXES.flat_map do |suffix|
            ["#{digits}#{suffix}", "#{digits}-#{suffix}"]
          end

          suffixed + [digits]
        end

        # Cartesian product of the per-word variants. In each combination
        # blank variants are dropped and the rest are sorted and joined
        # with a single space.
        def recombine(variants)
          return variants if variants.empty?

          first, *others = variants
          first.product(*others).map do |combination|
            combination.reject(&:blank?).sort.join(' ')
          end
        end

        IN_BRACKETS = /\((.*)\)/
        OPTIONAL = ''
        NUMERAL_SUFFIXES = %w(й я е ая ий ый ой ые ое го)
        ANNIVESARY_FORMS =
          ['\1-летия', '\1-лет', '\1 летия', '\1 лет', '\1-летие', '\1 летие']
      end
    end
  end
end
@@ -0,0 +1,43 @@
1
module Fias
  # Mixin implementing the search flow. The including class must provide
  # #find; the default implementation raises NotImplementedError.
  module Query
    attr_reader :params

    # Wraps the raw params into Params and builds a Finder that calls
    # back into #find.
    def initialize(params)
      @params = Params.new(params)
      @finder = Finder.new(@params, method(:find))
    end

    # Returns [rate, first_chain_item] pairs for the best-rated chains.
    def perform
      estimate(@finder.assumption)
    end

    protected

    # To be overridden by the including class.
    def find(_tokens)
      raise NotImplementedError
    end

    # Rates the chains and keeps only the top-rated ones.
    def estimate(assumption)
      reject_invalid_chains(estimate_chains(assumption))
    end

    # [rate, first item of the chain] pairs ordered by rate, highest first.
    def estimate_chains(assumption)
      rated = assumption.map { |chain| [rate(chain), chain.first] }
      rated.sort_by(&:first).reverse
    end

    # Removes (in place) every pair whose rate is below the highest one.
    # Expects the pairs to be ordered by rate, highest first.
    def reject_invalid_chains(chains)
      return chains if chains.empty?

      best = chains.first.first
      chains.delete_if { |(score, _)| score != best }
    end

    # Numeric rate of a single chain.
    def rate(chain)
      Estimate.new(@params, chain).estimate
    end
  end
end
@@ -0,0 +1,67 @@
1
module Fias
  module Query
    # Computes a numeric rate for a chain of found items against the
    # query params. Chain items are read through the :key, :abbr,
    # :ancestry and :tokens keys.
    class Estimate
      RATES = {
        subject: 10000,   # It's most important to match street if street is requested
        found_part: 1000, # Then, maximum parts number should coincide
        type: 100,        # Then, status should coincide,
        name: 5,          # Then, how close name matches are
        deep: -1          # Then, how deep is matching chain situated
      }

      def initialize(params, chain)
        @params = params
        @chain = chain
      end

      # Sum of all partial rates.
      def estimate
        [
          for_subject,
          for_found_parts,
          for_type,
          for_deepness,
          for_name_proximity
        ].inject(:+)
      end

      private

      # Bonus when the first chain item has the first requested key.
      def for_subject
        requested = @params.sanitized.keys.first
        return 0 unless requested == @chain.first[:key]

        RATES[:subject]
      end

      # Bonus for every item in the chain.
      def for_found_parts
        RATES[:found_part] * @chain.size
      end

      # Bonus for every requested key whose expected statuses (everything
      # after the first element of the sanitized value) include the :abbr
      # of the chain item found for that key.
      def for_type
        @params.sanitized.sum do |key, (_, *expected_status)|
          received_status = chain_by_key[key].try(:[], :abbr)

          matched =
            expected_status.present? &&
            expected_status.include?(received_status)

          matched ? RATES[:type] : 0
        end
      end

      # Penalty proportional to the ancestry size of the first chain item.
      def for_deepness
        depth = @chain.first[:ancestry].size
        depth * RATES[:deep]
      end

      # Bonus for every token shared between the expected synonyms and
      # the :tokens of the chain item found for the same key.
      def for_name_proximity
        @params.synonyms.sum do |key, (expected, _)|
          given = chain_by_key[key].try(:[], :tokens) || []
          shared = given & expected.flatten.uniq

          shared.size * RATES[:name]
        end
      end

      # Chain items indexed by their :key (memoized).
      def chain_by_key
        @chain_by_key ||= @chain.index_by { |item| item[:key] }
      end
    end
  end
end
@@ -0,0 +1,75 @@
1
module Fias
  module Query
    # Looks up candidate items ("endpoints") for every requested key and
    # assembles them into chains. Endpoints are hashes read through the
    # :forms, :ancestry and :id keys; a :key entry is written into each.
    class Finder
      # params - object responding to #split and #forms (hashes by key).
      # find   - callable that receives a list of words and returns
      #          endpoints.
      def initialize(params, find)
        @params = params
        @find = find
      end

      # Returns an array of chains. Each chain is an endpoint for the
      # first requested key followed by those of its ancestors that were
      # found for the remaining keys. Returns [] when nothing was
      # requested.
      def assumption
        find_endpoints
        return [] if @endpoints.blank?
        reject_inconsistent_chains
      end

      private

      # Fills @endpoints with { key => endpoints } and tags every endpoint
      # with the key it was found for.
      def find_endpoints
        pairs = @params.split.keys.map { |key| find_endpoint(key) }
        @endpoints = Hash[pairs]
        inject_key_to_endpoints
      end

      # [key, endpoints found for the words of that key].
      def find_endpoint(key)
        words = @params.split[key]
        endpoints = reject_endpoints(find(words), key)
        [key, endpoints]
      end

      def find(words)
        @find.call(words)
      end

      # Drops endpoints that share no form with the forms requested for
      # the key.
      def reject_endpoints(endpoints, key)
        forms = @params.forms[key]

        endpoints.reject do |endpoint|
          (forms & endpoint[:forms]).blank?
        end
      end

      def inject_key_to_endpoints
        @endpoints.each do |key, endpoints|
          endpoints.each { |endpoint| endpoint[:key] = key }
        end
      end

      # Keeps a starting endpoint when there are no parent candidates at
      # all, or when at least one parent candidate is in its ancestry.
      def reject_inconsistent_chains
        starting_endpoints = @endpoints.values.first
        parents = endpoints_parents

        chains = starting_endpoints.map do |endpoint|
          overlaps = parents.keys & endpoint[:ancestry]

          if parents.blank? || overlaps.present?
            [endpoint] + endpoint[:ancestry].map { |id| parents[id] }.compact
          end
        end

        chains.compact
      end

      # Endpoints of every key except the first, indexed by :id.
      #
      # Always returns a Hash: the caller uses #keys and #[] on the
      # result. The previous nil fallback returned an Array, which would
      # have raised NoMethodError on #keys. The list is reversed before
      # indexing so that, for a repeated id, the endpoint listed first is
      # the one kept.
      def endpoints_parents
        @endpoints
          .values
          .drop(1)
          .flatten
          .reverse
          .index_by { |endpoint| endpoint[:id] }
      end
    end
  end
end