maxmind-db 1.0.0.beta

Sign up to get free protection for your applications and to get access to all the features.
Files changed (78) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +4 -0
  3. data/LICENSE-APACHE +202 -0
  4. data/LICENSE-MIT +17 -0
  5. data/README.dev.md +30 -0
  6. data/README.md +54 -0
  7. data/Rakefile +12 -0
  8. data/bin/mmdb-benchmark.rb +61 -0
  9. data/lib/maxmind/db.rb +274 -0
  10. data/lib/maxmind/db/decoder.rb +232 -0
  11. data/lib/maxmind/db/errors.rb +8 -0
  12. data/lib/maxmind/db/file_reader.rb +37 -0
  13. data/lib/maxmind/db/memory_reader.rb +24 -0
  14. data/lib/maxmind/db/metadata.rb +61 -0
  15. data/maxmind-db.gemspec +19 -0
  16. data/test/data/LICENSE +4 -0
  17. data/test/data/MaxMind-DB-spec.md +558 -0
  18. data/test/data/MaxMind-DB-test-metadata-pointers.mmdb +0 -0
  19. data/test/data/README.md +4 -0
  20. data/test/data/bad-data/README.md +7 -0
  21. data/test/data/bad-data/libmaxminddb/libmaxminddb-offset-integer-overflow.mmdb +0 -0
  22. data/test/data/bad-data/maxminddb-golang/cyclic-data-structure.mmdb +0 -0
  23. data/test/data/bad-data/maxminddb-golang/invalid-bytes-length.mmdb +1 -0
  24. data/test/data/bad-data/maxminddb-golang/invalid-data-record-offset.mmdb +0 -0
  25. data/test/data/bad-data/maxminddb-golang/invalid-map-key-length.mmdb +0 -0
  26. data/test/data/bad-data/maxminddb-golang/invalid-string-length.mmdb +1 -0
  27. data/test/data/bad-data/maxminddb-golang/metadata-is-an-uint128.mmdb +1 -0
  28. data/test/data/bad-data/maxminddb-golang/unexpected-bytes.mmdb +0 -0
  29. data/test/data/perltidyrc +12 -0
  30. data/test/data/source-data/GeoIP2-Anonymous-IP-Test.json +41 -0
  31. data/test/data/source-data/GeoIP2-City-Test.json +12852 -0
  32. data/test/data/source-data/GeoIP2-Connection-Type-Test.json +102 -0
  33. data/test/data/source-data/GeoIP2-Country-Test.json +11347 -0
  34. data/test/data/source-data/GeoIP2-DensityIncome-Test.json +14 -0
  35. data/test/data/source-data/GeoIP2-Domain-Test.json +452 -0
  36. data/test/data/source-data/GeoIP2-Enterprise-Test.json +673 -0
  37. data/test/data/source-data/GeoIP2-ISP-Test.json +12585 -0
  38. data/test/data/source-data/GeoIP2-Precision-Enterprise-Test.json +1598 -0
  39. data/test/data/source-data/GeoIP2-User-Count-Test.json +2824 -0
  40. data/test/data/source-data/GeoLite2-ASN-Test.json +37 -0
  41. data/test/data/source-data/README +15 -0
  42. data/test/data/test-data/GeoIP2-Anonymous-IP-Test.mmdb +0 -0
  43. data/test/data/test-data/GeoIP2-City-Test-Broken-Double-Format.mmdb +0 -0
  44. data/test/data/test-data/GeoIP2-City-Test-Invalid-Node-Count.mmdb +0 -0
  45. data/test/data/test-data/GeoIP2-City-Test.mmdb +0 -0
  46. data/test/data/test-data/GeoIP2-Connection-Type-Test.mmdb +0 -0
  47. data/test/data/test-data/GeoIP2-Country-Test.mmdb +0 -0
  48. data/test/data/test-data/GeoIP2-DensityIncome-Test.mmdb +0 -0
  49. data/test/data/test-data/GeoIP2-Domain-Test.mmdb +0 -0
  50. data/test/data/test-data/GeoIP2-Enterprise-Test.mmdb +0 -0
  51. data/test/data/test-data/GeoIP2-ISP-Test.mmdb +0 -0
  52. data/test/data/test-data/GeoIP2-Precision-Enterprise-Test.mmdb +0 -0
  53. data/test/data/test-data/GeoIP2-User-Count-Test.mmdb +0 -0
  54. data/test/data/test-data/GeoLite2-ASN-Test.mmdb +0 -0
  55. data/test/data/test-data/MaxMind-DB-no-ipv4-search-tree.mmdb +0 -0
  56. data/test/data/test-data/MaxMind-DB-string-value-entries.mmdb +0 -0
  57. data/test/data/test-data/MaxMind-DB-test-broken-pointers-24.mmdb +0 -0
  58. data/test/data/test-data/MaxMind-DB-test-broken-search-tree-24.mmdb +0 -0
  59. data/test/data/test-data/MaxMind-DB-test-decoder.mmdb +0 -0
  60. data/test/data/test-data/MaxMind-DB-test-ipv4-24.mmdb +0 -0
  61. data/test/data/test-data/MaxMind-DB-test-ipv4-28.mmdb +0 -0
  62. data/test/data/test-data/MaxMind-DB-test-ipv4-32.mmdb +0 -0
  63. data/test/data/test-data/MaxMind-DB-test-ipv6-24.mmdb +0 -0
  64. data/test/data/test-data/MaxMind-DB-test-ipv6-28.mmdb +0 -0
  65. data/test/data/test-data/MaxMind-DB-test-ipv6-32.mmdb +0 -0
  66. data/test/data/test-data/MaxMind-DB-test-metadata-pointers.mmdb +0 -0
  67. data/test/data/test-data/MaxMind-DB-test-mixed-24.mmdb +0 -0
  68. data/test/data/test-data/MaxMind-DB-test-mixed-28.mmdb +0 -0
  69. data/test/data/test-data/MaxMind-DB-test-mixed-32.mmdb +0 -0
  70. data/test/data/test-data/MaxMind-DB-test-nested.mmdb +0 -0
  71. data/test/data/test-data/README.md +26 -0
  72. data/test/data/test-data/maps-with-pointers.raw +0 -0
  73. data/test/data/test-data/write-test-data.pl +620 -0
  74. data/test/data/tidyall.ini +5 -0
  75. data/test/mmdb_util.rb +24 -0
  76. data/test/test_decoder.rb +241 -0
  77. data/test/test_reader.rb +415 -0
  78. metadata +126 -0
@@ -0,0 +1,232 @@
1
require 'maxmind/db/errors'

module MaxMind # :nodoc:
  class DB
    # +Decoder+ decodes a {MaxMind DB}[http://maxmind.github.io/MaxMind-DB/]
    # data section.
    #
    # Typically you will interact with this class through a Reader rather than
    # directly.
    class Decoder # :nodoc:
      # Create a +Decoder+.
      #
      # +io+ is the DB. It must provide a +read+ method. It must be opened in
      # binary mode.
      #
      # +pointer_base+ is the base number to use when decoding a pointer. It is
      # where the data section begins rather than the beginning of the file.
      # The specification states the formula in the `Data Section Separator'
      # section.
      #
      # +pointer_test+ is used for testing pointer code.
      def initialize(io, pointer_base = 0, pointer_test = false)
        @io = io
        @pointer_base = pointer_base
        @pointer_test = pointer_test
      end

      private

      # Decode +size+ consecutive values starting at +offset+ into an Array.
      # For arrays, +size+ is the element count, not a byte length.
      def decode_array(size, offset)
        array = []
        size.times do
          value, offset = decode(offset)
          array << value
        end
        [array, offset]
      end

      # A boolean's value is encoded in its size field; there is no payload.
      def decode_boolean(size, offset)
        [size != 0, offset]
      end

      # Raw bytes are returned as-is (a binary encoded String).
      def decode_bytes(size, offset)
        [@io.read(offset, size), offset + size]
      end

      # IEEE-754 double (binary64), big-endian. Always 8 bytes per the spec.
      def decode_double(size, offset)
        verify_size(8, size)
        buf = @io.read(offset, 8)
        [buf.unpack('G'.freeze)[0], offset + 8]
      end

      # IEEE-754 float (binary32), big-endian. Always 4 bytes per the spec.
      def decode_float(size, offset)
        verify_size(4, size)
        buf = @io.read(offset, 4)
        [buf.unpack('g'.freeze)[0], offset + 4]
      end

      # Raise unless the encoded size matches the fixed size the type
      # requires (doubles and floats have mandatory sizes).
      def verify_size(expected, actual)
        return if expected == actual
        raise InvalidDatabaseError,
              'The MaxMind DB file\'s data section contains bad data (unknown data type or corrupt data)'.freeze
      end

      def decode_int32(size, offset)
        decode_int('l>'.freeze, 4, size, offset)
      end

      def decode_uint16(size, offset)
        decode_int('n'.freeze, 2, size, offset)
      end

      def decode_uint32(size, offset)
        decode_int('N'.freeze, 4, size, offset)
      end

      def decode_uint64(size, offset)
        decode_int('Q>'.freeze, 8, size, offset)
      end

      # Integers are stored with a variable number of bytes (0 up to the
      # type's full width). Left-pad with NUL bytes to the full width before
      # unpacking. A size of zero always means the value 0.
      def decode_int(type_code, type_size, size, offset)
        return 0, offset if size == 0

        buf = @io.read(offset, size)
        buf = buf.rjust(type_size, "\x00".freeze) if size != type_size
        [buf.unpack(type_code)[0], offset + size]
      end

      # 128-bit unsigned integers do not fit a single unpack directive, so
      # decode as (up to) two big-endian 64-bit halves and combine them.
      def decode_uint128(size, offset)
        return 0, offset if size == 0

        buf = @io.read(offset, size)

        if size <= 8
          buf = buf.rjust(8, "\x00".freeze)
          return buf.unpack('Q>'.freeze)[0], offset + size
        end

        a_bytes = buf[0...-8].rjust(8, "\x00".freeze)
        b_bytes = buf[-8...buf.length]
        a = a_bytes.unpack('Q>'.freeze)[0]
        b = b_bytes.unpack('Q>'.freeze)[0]
        a <<= 64
        [a | b, offset + size]
      end

      # For maps, +size+ is the number of key/value pairs, not bytes.
      def decode_map(size, offset)
        container = {}
        size.times do
          key, offset = decode(offset)
          value, offset = decode(offset)
          container[key] = value
        end
        [container, offset]
      end

      # Decode a pointer into the data section. Bits 3-4 of the raw size
      # field select one of four pointer widths; see the `pointer' section of
      # the specification for the offset constants. Unless +pointer_test+ is
      # set, follow the pointer and return the value it points at.
      def decode_pointer(size, offset)
        pointer_size = size >> 3

        case pointer_size
        when 0
          new_offset = offset + 1
          buf = (size & 0x7).chr << @io.read(offset, 1)
          pointer = buf.unpack('n'.freeze)[0] + @pointer_base
        when 1
          new_offset = offset + 2
          buf = "\x00".freeze.b << (size & 0x7).chr << @io.read(offset, 2)
          pointer = buf.unpack('N'.freeze)[0] + 2048 + @pointer_base
        when 2
          new_offset = offset + 3
          buf = (size & 0x7).chr << @io.read(offset, 3)
          pointer = buf.unpack('N'.freeze)[0] + 526_336 + @pointer_base
        else
          new_offset = offset + 4
          buf = @io.read(offset, 4)
          pointer = buf.unpack('N'.freeze)[0] + @pointer_base
        end

        return pointer, new_offset if @pointer_test

        value, = decode(pointer)
        [value, new_offset]
      end

      def decode_utf8_string(size, offset)
        new_offset = offset + size
        buf = @io.read(offset, size)
        buf.force_encoding(Encoding::UTF_8)
        # We could check it's valid UTF-8 with `valid_encoding?', but for
        # performance I do not.
        [buf, new_offset]
      end

      # Map from the spec's type numbers to decoder methods. Types 12 (data
      # cache container) and 13 (end marker) have no decoder here.
      TYPE_DECODER = {
        1 => :decode_pointer,
        2 => :decode_utf8_string,
        3 => :decode_double,
        4 => :decode_bytes,
        5 => :decode_uint16,
        6 => :decode_uint32,
        7 => :decode_map,
        8 => :decode_int32,
        9 => :decode_uint64,
        10 => :decode_uint128,
        11 => :decode_array,
        14 => :decode_boolean,
        15 => :decode_float,
      }.freeze
      private_constant :TYPE_DECODER

      public

      # Decode a section of the data section starting at +offset+.
      #
      # +offset+ is the location of the data structure to decode.
      #
      # Returns an array where the first element is the decoded value and the
      # second is the offset after decoding it.
      #
      # Throws an exception if there is an error.
      def decode(offset)
        new_offset = offset + 1
        buf = @io.read(offset, 1)
        ctrl_byte = buf.ord
        type_num = ctrl_byte >> 5
        # A type of 0 in the control byte means an extended type: the actual
        # type number lives in the next byte.
        type_num, new_offset = read_extended(new_offset) if type_num == 0

        size, new_offset = size_from_ctrl_byte(ctrl_byte, new_offset, type_num)
        # We could check an element exists at `type_num', but for performance I
        # don't.
        send(TYPE_DECODER[type_num], size, new_offset)
      end

      private

      # Read the extended-type byte. The specification states the actual type
      # is the byte's value plus 7, so valid extended types are >= 8 (types
      # 1-7 are encoded directly in the control byte).
      def read_extended(offset)
        buf = @io.read(offset, 1)
        next_byte = buf.ord
        type_num = next_byte + 7
        # BUG FIX: this previously tested `type_num < 7', which can never be
        # true (next_byte is non-negative, so type_num >= 7), silently letting
        # an invalid extended-type byte of 0 (resolving to type 7) through.
        # The error message and the spec both require the resolved type to be
        # at least 8.
        if type_num < 8
          raise InvalidDatabaseError,
                "Something went horribly wrong in the decoder. An extended type resolved to a type number < 8 (#{type_num})"
        end
        [type_num, offset + 1]
      end

      # Determine the payload size from the control byte, consuming extra
      # size bytes when the 5-bit size field is 29, 30, or 31.
      def size_from_ctrl_byte(ctrl_byte, offset, type_num)
        size = ctrl_byte & 0x1f

        # Pointers (type 1) pack width/value bits into the size field;
        # decode_pointer interprets the raw 5 bits itself.
        return size, offset if type_num == 1

        return size, offset if size < 29

        if size == 29
          size_bytes = @io.read(offset, 1)
          size = 29 + size_bytes.ord
          return size, offset + 1
        end

        if size == 30
          size_bytes = @io.read(offset, 2)
          size = 285 + size_bytes.unpack('n'.freeze)[0]
          return size, offset + 2
        end

        size_bytes = "\x00".freeze.b << @io.read(offset, 3)
        size = 65_821 + size_bytes.unpack('N'.freeze)[0]
        [size, offset + 3]
      end
    end
  end
end
@@ -0,0 +1,8 @@
1
module MaxMind # :nodoc:
  class DB
    # Raised when a {MaxMind DB}[http://maxmind.github.io/MaxMind-DB/] file
    # is corrupt or otherwise invalid.
    #
    # Subclasses RuntimeError so a bare +rescue+ catches it.
    class InvalidDatabaseError < RuntimeError
    end
  end
end
@@ -0,0 +1,37 @@
1
require 'maxmind/db/errors'

module MaxMind # :nodoc:
  class DB
    # +FileReader+ serves database contents by reading from the file on disk
    # for each request.
    class FileReader # :nodoc:
      # Open +filename+ in binary mode and record its size.
      def initialize(filename)
        @fh = File.new(filename, 'rb'.freeze)
        @size = @fh.size
        @mutex = Mutex.new
      end

      # Size of the database file in bytes.
      attr_reader :size

      # Close the underlying file handle.
      def close
        @fh.close
      end

      # Return +size+ bytes starting at +offset+.
      #
      # Raises InvalidDatabaseError if the requested range cannot be read in
      # full (e.g. it extends past the end of the file).
      def read(offset, size)
        return ''.freeze.b if size == 0

        # When we support only Ruby 2.5+, remove this and require pread.
        buf =
          if @fh.respond_to?(:pread)
            # pread is atomic, so no locking is needed on this path.
            @fh.pread(size, offset)
          else
            # Fall back to seek+read, serialized so concurrent readers do not
            # interleave their seeks.
            @mutex.synchronize do
              @fh.seek(offset, IO::SEEK_SET)
              @fh.read(size)
            end
          end

        raise InvalidDatabaseError, 'The MaxMind DB file contains bad data'.freeze if buf.nil? || buf.length != size

        buf
      end
    end
  end
end
@@ -0,0 +1,24 @@
1
module MaxMind # :nodoc:
  class DB
    # +MemoryReader+ serves database contents from an in-memory buffer.
    class MemoryReader # :nodoc:
      # +filename+ is a path to load into memory, or, when
      # +options[:is_buffer]+ is true, the raw database contents themselves.
      def initialize(filename, options = {})
        @buf =
          if options[:is_buffer]
            filename
          else
            File.read(filename, mode: 'rb'.freeze).freeze
          end
        @size = @buf.length
      end

      # Size of the database in bytes.
      attr_reader :size

      # No-op: there is no handle to release for an in-memory buffer.
      def close; end

      # Return +size+ bytes starting at +offset+ (String#[] semantics).
      def read(offset, size)
        @buf[offset, size]
      end
    end
  end
end
@@ -0,0 +1,61 @@
1
module MaxMind # :nodoc:
  class DB
    # Metadata holds metadata about a {MaxMind
    # DB}[http://maxmind.github.io/MaxMind-DB/] file.
    class Metadata
      # The number of nodes in the search tree.
      attr_reader :node_count

      # The bit size of a record in the search tree.
      attr_reader :record_size

      # The IP version of the data in the database. 4 means the database
      # only supports IPv4; 6 means it may support both IPv4 and IPv6.
      attr_reader :ip_version

      # A string identifying the database type, e.g. "GeoIP2-City".
      attr_reader :database_type

      # An array of locale codes supported by the database.
      attr_reader :languages

      # The major version number of the binary format used when creating the
      # database.
      attr_reader :binary_format_major_version

      # The minor version number of the binary format used when creating the
      # database.
      attr_reader :binary_format_minor_version

      # The Unix epoch for the build time of the database.
      attr_reader :build_epoch

      # A hash from locales to text descriptions of the database.
      attr_reader :description

      # +map+ is a hash representing the metadata map from the database file.
      # Missing keys leave the corresponding attribute nil.
      def initialize(map)
        @binary_format_major_version = map['binary_format_major_version']
        @binary_format_minor_version = map['binary_format_minor_version']
        @build_epoch = map['build_epoch']
        @database_type = map['database_type']
        @description = map['description']
        @ip_version = map['ip_version']
        @languages = map['languages']
        @node_count = map['node_count']
        @record_size = map['record_size']
      end

      # The size of a node in bytes: two records of +record_size+ bits each.
      def node_byte_size
        (@record_size * 2) / 8
      end

      # The size of the search tree in bytes.
      def search_tree_size
        @node_count * node_byte_size
      end
    end
  end
end
@@ -0,0 +1,19 @@
1
# Gem specification for the maxmind-db gem.
Gem::Specification.new do |s|
  # Identity.
  s.name = 'maxmind-db'
  s.version = '1.0.0.beta'
  s.summary = 'A gem for reading MaxMind DB files.'
  s.description = 'A gem for reading MaxMind DB files. MaxMind DB is a binary file format that stores data indexed by IP address subnets (IPv4 or IPv6).'

  # Authorship and licensing.
  s.authors = ['William Storey']
  s.email = 'wstorey@maxmind.com'
  s.licenses = ['Apache-2.0', 'MIT']

  # Contents: everything in the tree.
  s.files = Dir['**/*']

  # Project links.
  s.homepage = 'https://github.com/maxmind/MaxMind-DB-Reader-ruby'
  s.metadata = {
    'bug_tracker_uri' => 'https://github.com/maxmind/MaxMind-DB-Reader-ruby/issues',
    'changelog_uri' => 'https://github.com/maxmind/MaxMind-DB-Reader-ruby/blob/master/CHANGELOG.md',
    'documentation_uri' => 'https://github.com/maxmind/MaxMind-DB-Reader-ruby',
    'homepage_uri' => 'https://github.com/maxmind/MaxMind-DB-Reader-ruby',
    'source_code_uri' => 'https://github.com/maxmind/MaxMind-DB-Reader-ruby',
  }
end
@@ -0,0 +1,4 @@
This work is licensed under the Creative Commons Attribution-ShareAlike 3.0
Unported License. To view a copy of this license, visit
http://creativecommons.org/licenses/by-sa/3.0/ or send a letter to Creative
Commons, 444 Castro Street, Suite 900, Mountain View, California, 94041, USA.
@@ -0,0 +1,558 @@
1
+ ---
2
+ layout: default
3
+ title: MaxMind DB File Format Specification
4
+ version: v2.0
5
+ ---
6
+ # MaxMind DB File Format Specification
7
+
8
+ ## Description
9
+
10
+ The MaxMind DB file format is a database format that maps IPv4 and IPv6
11
+ addresses to data records using an efficient binary search tree.
12
+
13
+ ## Version
14
+
15
+ This spec documents **version 2.0** of the MaxMind DB binary format.
16
+
17
+ The version number consists of separate major and minor version numbers. It
18
+ should not be considered a decimal number. In other words, version 2.10 comes
19
+ after version 2.9.
20
+
21
+ Code which is capable of reading a given major version of the format should
22
+ not be broken by minor version changes to the format.
23
+
24
+ ## Overview
25
+
26
+ The binary database is split into three parts:
27
+
28
+ 1. The binary search tree. Each level of the tree corresponds to a single bit
29
+ in the 128 bit representation of an IPv6 address.
30
+ 2. The data section. These are the values returned to the client for a
31
+ specific IP address, e.g. "US", "New York", or a more complex map type made up
32
+ of multiple fields.
33
+ 3. Database metadata. Information about the database itself.
34
+
35
+ ## Database Metadata
36
+
37
+ This portion of the database is stored at the end of the file. It is
38
+ documented first because understanding some of the metadata is key to
39
+ understanding how the other sections work.
40
+
41
+ This section can be found by looking for a binary sequence matching
42
+ "\xab\xcd\xefMaxMind.com". The *last* occurrence of this string in the file
43
+ marks the end of the data section and the beginning of the metadata. Since we
44
+ allow for arbitrary binary data in the data section, some other piece of data
45
+ could contain these values. This is why you need to find the last occurrence
46
+ of this sequence.
47
+
48
+ The maximum allowable size for the metadata section, including the marker that
49
+ starts the metadata, is 128KiB.
50
+
51
+ The metadata is stored as a map data structure. This structure is described
52
+ later in the spec. Changing a key's data type or removing a key would
53
+ constitute a major version change for this spec.
54
+
55
+ Except where otherwise specified, each key listed is required for the database
56
+ to be considered valid.
57
+
58
+ Adding a key constitutes a minor version change. Removing a key or changing
59
+ its type constitutes a major version change.
60
+
61
+ The list of known keys for the current version of the format is as follows:
62
+
63
+ ### node\_count
64
+
65
+ This is an unsigned 32-bit integer indicating the number of nodes in the
66
+ search tree.
67
+
68
+ ### record\_size
69
+
70
+ This is an unsigned 16-bit integer. It indicates the number of bits in a
71
+ record in the search tree. Note that each node consists of *two* records.
72
+
73
+ ### ip\_version
74
+
75
+ This is an unsigned 16-bit integer which is always 4 or 6. It indicates
76
+ whether the database contains IPv4 or IPv6 address data.
77
+
78
+ ### database\_type
79
+
80
+ This is a string that indicates the structure of each data record associated
81
+ with an IP address. The actual definition of these structures is left up to
82
+ the database creator.
83
+
84
+ Names starting with "GeoIP" are reserved for use by MaxMind (and "GeoIP" is a
85
+ trademark anyway).
86
+
87
+ ### languages
88
+
89
+ An array of strings, each of which is a locale code. A given record may
90
+ contain data items that have been localized to some or all of these
91
+ locales. Records should not contain localized data for locales not included in
92
+ this array.
93
+
94
+ This is an optional key, as this may not be relevant for all types of data.
95
+
96
+ ### binary\_format\_major\_version
97
+
98
+ This is an unsigned 16-bit integer indicating the major version number for the
99
+ database's binary format.
100
+
101
+ ### binary\_format\_minor\_version
102
+
103
+ This is an unsigned 16-bit integer indicating the minor version number for the
104
+ database's binary format.
105
+
106
+ ### build\_epoch
107
+
108
+ This is an unsigned 64-bit integer that contains the database build timestamp
109
+ as a Unix epoch value.
110
+
111
+ ### description
112
+
113
+ This key will always point to a map. The keys of that map will be language
114
+ codes, and the values will be a description in that language as a UTF-8
115
+ string.
116
+
117
+ The codes may include additional information such as script or country
118
+ identifiers, like "zh-TW" or "mn-Cyrl-MN". The additional identifiers will be
119
+ separated by a dash character ("-").
120
+
121
+ This key is optional. However, creators of databases are strongly
122
+ encouraged to include a description in at least one language.
123
+
124
+ ### Calculating the Search Tree Section Size
125
+
126
+ The formula for calculating the search tree section size *in bytes* is as
127
+ follows:
128
+
129
+ ( ( $record_size * 2 ) / 8 ) * $number_of_nodes
130
+
131
+ The end of the search tree marks the beginning of the data section.
132
+
133
+ ## Binary Search Tree Section
134
+
135
+ The database file starts with a binary search tree. The number of nodes in the
136
+ tree is dependent on how many unique netblocks are needed for the particular
137
+ database. For example, the city database needs many more small netblocks than
138
+ the country database.
139
+
140
+ The top most node is always located at the beginning of the search tree
141
+ section's address space. The top node is node 0.
142
+
143
+ Each node consists of two records, each of which is a pointer to an address in
144
+ the file.
145
+
146
+ The pointers can point to one of three things. First, it may point to another
147
+ node in the search tree address space. These pointers are followed as part of
148
+ the IP address search algorithm, described below.
149
+
150
+ The pointer can point to a value equal to `$number_of_nodes`. If this is the
151
+ case, it means that the IP address we are searching for is not in the
152
+ database.
153
+
154
+ Finally, it may point to an address in the data section. This is the data
155
+ relevant to the given netblock.
156
+
157
+ ### Node Layout
158
+
159
+ Each node in the search tree consists of two records, each of which is a
160
+ pointer. The record size varies by database, but inside a single database node
161
+ records are always the same size. A record may be anywhere from 24 to 128 bits
162
+ long, depending on the number of nodes in the tree. These pointers are
163
+ stored in big-endian format (most significant byte first).
164
+
165
+ Here are some examples of how the records are laid out in a node for 24, 28,
166
+ and 32 bit records. Larger record sizes follow this same pattern.
167
+
168
+ #### 24 bits (small database), one node is 6 bytes
169
+
170
+ | <------------- node --------------->|
171
+ | 23 .. 0 | 23 .. 0 |
172
+
173
+ #### 28 bits (medium database), one node is 7 bytes
174
+
175
+ | <------------- node --------------->|
176
+ | 23 .. 0 | 27..24 | 27..24 | 23 .. 0 |
177
+
178
+ Note, the last 4 bits of each pointer are combined into the middle byte.
179
+
180
+ #### 32 bits (large database), one node is 8 bytes
181
+
182
+ | <------------- node --------------->|
183
+ | 31 .. 0 | 31 .. 0 |
184
+
185
+ ### Search Lookup Algorithm
186
+
187
+ The first step is to convert the IP address to its big-endian binary
188
+ representation. For an IPv4 address, this becomes 32 bits. For IPv6 you get
189
+ 128 bits.
190
+
191
+ The leftmost bit corresponds to the first node in the search tree. For each
192
+ bit, a value of 0 means we choose the left record in a node, and a value of 1
193
+ means we choose the right record.
194
+
195
+ The record value is always interpreted as an unsigned integer. The maximum
196
+ size of the integer is dependent on the number of bits in a record (24, 28, or
197
+ 32).
198
+
199
+ If the record value is a number that is less than the *number of nodes* (not
200
+ in bytes, but the actual node count) in the search tree (this is stored in the
201
+ database metadata), then the value is a node number. In this case, we find
202
+ that node in the search tree and repeat the lookup algorithm from there.
203
+
204
+ If the record value is equal to the number of nodes, that means that we do not
205
+ have any data for the IP address, and the search ends here.
206
+
207
+ If the record value is *greater* than the number of nodes in the search tree,
208
+ then it is an actual pointer value pointing into the data section. The value
209
+ of the pointer is calculated from the start of the data section, *not* from
210
+ the start of the file.
211
+
212
+ In order to determine where in the data section we should start looking, we use
213
+ the following formula:
214
+
215
+ $data_section_offset = ( $record_value - $node_count ) - 16
216
+
217
+ The `16` is the size of the data section separator (see below for details).
218
+
219
+ The reason that we subtract the `$node_count` is best demonstrated by an example.
220
+
221
+ Let's assume we have a 24-bit tree with 1,000 nodes. Each node contains 48
222
+ bits, or 6 bytes. The size of the tree is 6,000 bytes.
223
+
224
+ When a record in the tree contains a number that is less than 1,000, this
225
+ is a *node number*, and we look up that node. If a record contains a value
226
+ greater than or equal to 1,016, we know that it is a data section value. We
227
+ subtract the node count (1,000) and then subtract 16 for the data section
228
+ separator, giving us the number 0, the first byte of the data section.
229
+
230
+ If a record contained the value 6,000, this formula would give us an offset of
231
+ 4,984 into the data section.
232
+
233
+ In order to determine where in the file this offset really points to, we also
234
+ need to know where the data section starts. This can be calculated by
235
+ determining the size of the search tree in bytes and then adding an additional
236
+ 16 bytes for the data section separator.
237
+
238
+ So the final formula to determine the offset in the file is:
239
+
240
+ $offset_in_file = ( $record_value - $node_count )
241
+ + $search_tree_size_in_bytes
242
+
243
+ ### IPv4 addresses in an IPv6 tree
244
+
245
+ When storing IPv4 addresses in an IPv6 tree, they are stored as-is, so they
246
+ occupy the first 32-bits of the address space (from 0 to 2**32 - 1).
247
+
248
+ Creators of databases should decide on a strategy for handling the various
249
+ mappings between IPv4 and IPv6.
250
+
251
+ The strategy that MaxMind uses for its GeoIP databases is to include a pointer
252
+ from the `::ffff:0:0/96` subnet to the root node of the IPv4 address space in
253
+ the tree. This accounts for the
254
+ [IPv4-mapped IPv6 address](http://en.wikipedia.org/wiki/IPv6#IPv4-mapped_IPv6_addresses).
255
+
256
+ MaxMind also includes a pointer from the `2002::/16` subnet to the root node
257
+ of the IPv4 address space in the tree. This accounts for the
258
+ [6to4 mapping](http://en.wikipedia.org/wiki/6to4) subnet.
259
+
260
+ Database creators are encouraged to document whether they are doing something
261
+ similar for their databases.
262
+
263
+ The Teredo subnet cannot be accounted for in the tree. Instead, code that
264
+ searches the tree can offer to decode the IPv4 portion of a Teredo address and
265
+ look that up.
266
+
267
+ ## Data Section Separator
268
+
269
+ There are 16 bytes of NULLs in between the search tree and the data
270
+ section. This separator exists in order to make it possible for a verification
271
+ tool to distinguish between the two sections.
272
+
273
+ This separator is not considered part of the data section itself. In other
274
+ words, the data section starts at `$size_of_search_tree + 16` bytes in the
275
+ file.
276
+
277
+ ## Output Data Section
278
+
279
+ Each output data field has an associated type, and that type is encoded as a
280
+ number that begins the data field. Some types are variable length. In those
281
+ cases, the type indicator is also followed by a length. The data payload
282
+ always comes at the end of the field.
283
+
284
+ All binary data is stored in big-endian format.
285
+
286
+ Note that the *interpretation* of a given data type's meaning is decided by
287
+ higher-level APIs, not by the binary format itself.
288
+
289
+ ### pointer - 1
290
+
291
+ A pointer to another part of the data section's address space. The pointer
292
+ will point to the beginning of a field. It is illegal for a pointer to point
293
+ to another pointer.
294
+
295
+ Pointer values start from the beginning of the data section, *not* the
296
+ beginning of the file.
297
+
298
+ ### UTF-8 string - 2
299
+
300
+ A variable length byte sequence that contains valid utf8. If the length is
301
+ zero then this is an empty string.
302
+
303
+ ### double - 3
304
+
305
+ This is stored as an IEEE-754 double (binary64) in big-endian format. The
306
+ length of a double is always 8 bytes.
307
+
308
+ ### bytes - 4
309
+
310
+ A variable length byte sequence containing any sort of binary data. If the
311
+ length is zero then this a zero-length byte sequence.
312
+
313
+ This is not currently used but may be used in the future to embed non-text
314
+ data (images, etc.).
315
+
316
+ ### integer formats
317
+
318
+ Integers are stored in variable length binary fields.
319
+
320
+ We support 16-bit, 32-bit, 64-bit, and 128-bit unsigned integers. We also
321
+ support 32-bit signed integers.
322
+
323
+ A 128-bit integer can use up to 16 bytes, but may use fewer. Similarly, a
324
+ 32-bit integer may use from 0-4 bytes. The number of bytes used is determined
325
+ by the length specifier in the control byte. See below for details.
326
+
327
+ A length of zero always indicates the number 0.
328
+
329
+ When storing a signed integer, the left-most bit is the sign. A 1 is negative
330
+ and a 0 is positive.
331
+
332
+ The type numbers for our integer types are:
333
+
334
+ * unsigned 16-bit int - 5
335
+ * unsigned 32-bit int - 6
336
+ * signed 32-bit int - 8
337
+ * unsigned 64-bit int - 9
338
+ * unsigned 128-bit int - 10
339
+
340
+ The unsigned 32-bit and 128-bit types may be used to store IPv4 and IPv6
341
+ addresses, respectively.
342
+
343
+ The signed 32-bit integers are stored using the 2's complement representation.
344
+
345
+ ### map - 7
346
+
347
+ A map data type contains a set of key/value pairs. Unlike other data types,
348
+ the length information for maps indicates how many key/value pairs it
349
+ contains, not its length in bytes. This size can be zero.
350
+
351
+ See below for the algorithm used to determine the number of pairs in the
352
+ hash. This algorithm is also used to determine the length of a field's
353
+ payload.
354
+
355
+ ### array - 11
356
+
357
+ An array type contains a set of ordered values. The length information for
358
+ arrays indicates how many values it contains, not its length in bytes. This
359
+ size can be zero.
360
+
361
+ This type uses the same algorithm as maps for determining the length of a
362
+ field's payload.
363
+
364
+ ### data cache container - 12
365
+
366
+ This is a special data type that marks a container used to cache repeated
367
+ data. For example, instead of repeating the string "United States" over and
368
+ over in the database, we store it in the cache container and use pointers
369
+ *into* this container instead.
370
+
371
+ Nothing in the database will ever contain a pointer to this field
372
+ itself. Instead, various fields will point into the container.
373
+
374
+ The primary reason for making this a separate data type versus simply inlining
375
+ the cached data is so that a database dumper tool can skip this cache when
376
+ dumping the data section. The cache contents will end up being dumped as
377
+ pointers into it are followed.
378
+
379
+ ### end marker - 13
380
+
381
+ The end marker marks the end of the data section. It is not strictly
382
+ necessary, but including this marker allows a data section deserializer to
383
+ process a stream of input, rather than having to find the end of the section
384
+ before beginning the deserialization.
385
+
386
+ This data type is not followed by a payload, and its size is always zero.
387
+
388
+ ### boolean - 14
389
+
390
+ A true or false value. The length information for a boolean type will always
391
+ be 0 or 1, indicating the value. There is no payload for this field.
392
+
393
+ ### float - 15
394
+
395
+ This is stored as an IEEE-754 float (binary32) in big-endian format. The
396
+ length of a float is always 4 bytes.
397
+
398
+ This type is provided primarily for completeness. Because of the way floating
399
+ point numbers are stored, this type can easily lose precision when serialized
400
+ and then deserialized. If this is an issue for you, consider using a double
401
+ instead.
402
+
403
+ ### Data Field Format
404
+
405
+ Each field starts with a control byte. This control byte provides information
406
+ about the field's data type and payload size.
407
+
408
+ The first three bits of the control byte tell you what type the field is. If
409
+ these bits are all 0, then this is an "extended" type, which means that the
410
+ *next* byte contains the actual type. Otherwise, the first three bits will
411
+ contain a number from 1 to 7, the actual type for the field.
412
+
413
+ We've tried to assign the most commonly used types as numbers 1-7 as an
414
+ optimization.
415
+
416
+ With an extended type, the type number in the second byte is the number
417
+ minus 7. In other words, an array (type 11) will be stored with a 0 for the
418
+ type in the first byte and a 4 in the second.
419
+
420
+ Here is an example of how the control byte may combine with the next byte to
421
+ tell us the type:
422
+
423
+ 001XXXXX pointer
424
+ 010XXXXX UTF-8 string
425
+ 110XXXXX unsigned 32-bit int (ASCII)
426
+ 000XXXXX 00000011 unsigned 128-bit int (binary)
427
+ 000XXXXX 00000100 array
428
+ 000XXXXX 00000110 end marker
429
+
430
+ #### Payload Size
431
+
432
+ The next five bits in the control byte tell you how long the data field's
433
+ payload is, except for maps and pointers. Maps and pointers use this size
434
+ information a bit differently. See below.
435
+
436
+ If the five bits are smaller than 29, then those bits are the payload size in
437
+ bytes. For example:
438
+
439
+ 01000010 UTF-8 string - 2 bytes long
440
+ 01011100 UTF-8 string - 28 bytes long
441
+ 11000001 unsigned 32-bit int - 1 byte long
442
+ 00000011 00000011 unsigned 128-bit int - 3 bytes long
443
+
444
+ If the five bits are equal to 29, 30, or 31, then use the following algorithm
445
+ to calculate the payload size.
446
+
447
+ If the value is 29, then the size is 29 + *the next byte after the type
448
+ specifying bytes as an unsigned integer*.
449
+
450
+ If the value is 30, then the size is 285 + *the next two bytes after the type
451
+ specifying bytes as a single unsigned integer*.
452
+
453
+ If the value is 31, then the size is 65,821 + *the next three bytes after the
454
+ type specifying bytes as a single unsigned integer*.
455
+
456
+ Some examples:
457
+
458
+ 01011101 00110011 UTF-8 string - 80 bytes long
459
+
460
+ In this case, the last five bits of the control byte equal 29. We treat the
461
+ next byte as an unsigned integer. The next byte is 51, so the total size is
462
+ (29 + 51) = 80.
463
+
464
+ 01011110 00110011 00110011 UTF-8 string - 13,392 bytes long
465
+
466
+ The last five bits of the control byte equal 30. We treat the next two bytes
467
+ as a single unsigned integer. The next two bytes equal 13,107, so the total
468
+ size is (285 + 13,107) = 13,392.
469
+
470
+ 01011111 00110011 00110011 00110011 UTF-8 string - 3,421,264 bytes long
471
+
472
+ The last five bits of the control byte equal 31. We treat the next three bytes
473
+ as a single unsigned integer. The next three bytes equal 3,355,443, so the
474
+ total size is (65,821 + 3,355,443) = 3,421,264.
475
+
476
+ This means that the maximum payload size for a single field is 16,843,036
477
+ bytes.
478
+
479
+ The binary number types always have a known size, but for consistency's sake,
480
+ the control byte will always specify the correct size for these types.
481
+
482
+ #### Maps
483
+
484
+ Maps use the size in the control byte (and any following bytes) to indicate
485
+ the number of key/value pairs in the map, not the size of the payload in
486
+ bytes.
487
+
488
+ This means that the maximum number of pairs for a single map is 16,843,036.
489
+
490
+ Maps are laid out with each key followed by its value, followed by the next
491
+ pair, etc.
492
+
493
+ The keys are **always** UTF-8 strings. The values may be any data type,
494
+ including maps or pointers.
495
+
496
+ Once we know the number of pairs, we can look at each pair in turn to
497
+ determine the size of the key and the key name, as well as the value's type
498
+ and payload.
499
+
500
+ #### Pointers
501
+
502
+ Pointers use the last five bits in the control byte to calculate the pointer
503
+ value.
504
+
505
+ To calculate the pointer value, we start by subdividing the five bits into two
506
+ groups. The first two bits indicate the size, and the next three bits are part
507
+ of the value, so we end up with a control byte breaking down like this:
508
+ 001SSVVV.
509
+
510
+ The size can be 0, 1, 2, or 3.
511
+
512
+ If the size is 0, the pointer is built by appending the next byte to the last
513
+ three bits to produce an 11-bit value.
514
+
515
+ If the size is 1, the pointer is built by appending the next two bytes to the
516
+ last three bits to produce a 19-bit value + 2048.
517
+
518
+ If the size is 2, the pointer is built by appending the next three bytes to the
519
+ last three bits to produce a 27-bit value + 526336.
520
+
521
+ Finally, if the size is 3, the pointer's value is contained in the next four
522
+ bytes as a 32-bit value. In this case, the last three bits of the control byte
523
+ are ignored.
524
+
525
+ This means that we are limited to 4GB of address space for pointers, so the
526
+ data section size for the database is limited to 4GB.
527
+
528
+ ## Reference Implementations
529
+
530
+ ### Writer
531
+
532
+ * [Perl](https://github.com/maxmind/MaxMind-DB-Writer-perl)
533
+
534
+ ### Reader
535
+
536
+ * [C](https://github.com/maxmind/libmaxminddb)
537
+ * [C#](https://github.com/maxmind/MaxMind-DB-Reader-dotnet)
538
+ * [Java](https://github.com/maxmind/MaxMind-DB-Reader-java)
539
+ * [Perl](https://github.com/maxmind/MaxMind-DB-Reader-perl)
540
+ * [PHP](https://github.com/maxmind/MaxMind-DB-Reader-php)
541
+ * [Python](https://github.com/maxmind/MaxMind-DB-Reader-python)
542
+
543
+ ## Authors
544
+
545
+ This specification was created by the following authors:
546
+
547
+ * Greg Oschwald \<goschwald@maxmind.com\>
548
+ * Dave Rolsky \<drolsky@maxmind.com\>
549
+ * Boris Zentner \<bzentner@maxmind.com\>
550
+
551
+ ## License
552
+
553
+ This work is licensed under the Creative Commons Attribution-ShareAlike 3.0
554
+ Unported License. To view a copy of this license, visit
555
+ [http://creativecommons.org/licenses/by-sa/3.0/](http://creativecommons.org/licenses/by-sa/3.0/)
556
+ or send a letter to Creative Commons, 444 Castro Street, Suite 900, Mountain
557
+ View, California, 94041, USA.
558
+