maxmind-db 1.0.0.beta

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +4 -0
  3. data/LICENSE-APACHE +202 -0
  4. data/LICENSE-MIT +17 -0
  5. data/README.dev.md +30 -0
  6. data/README.md +54 -0
  7. data/Rakefile +12 -0
  8. data/bin/mmdb-benchmark.rb +61 -0
  9. data/lib/maxmind/db.rb +274 -0
  10. data/lib/maxmind/db/decoder.rb +232 -0
  11. data/lib/maxmind/db/errors.rb +8 -0
  12. data/lib/maxmind/db/file_reader.rb +37 -0
  13. data/lib/maxmind/db/memory_reader.rb +24 -0
  14. data/lib/maxmind/db/metadata.rb +61 -0
  15. data/maxmind-db.gemspec +19 -0
  16. data/test/data/LICENSE +4 -0
  17. data/test/data/MaxMind-DB-spec.md +558 -0
  18. data/test/data/MaxMind-DB-test-metadata-pointers.mmdb +0 -0
  19. data/test/data/README.md +4 -0
  20. data/test/data/bad-data/README.md +7 -0
  21. data/test/data/bad-data/libmaxminddb/libmaxminddb-offset-integer-overflow.mmdb +0 -0
  22. data/test/data/bad-data/maxminddb-golang/cyclic-data-structure.mmdb +0 -0
  23. data/test/data/bad-data/maxminddb-golang/invalid-bytes-length.mmdb +1 -0
  24. data/test/data/bad-data/maxminddb-golang/invalid-data-record-offset.mmdb +0 -0
  25. data/test/data/bad-data/maxminddb-golang/invalid-map-key-length.mmdb +0 -0
  26. data/test/data/bad-data/maxminddb-golang/invalid-string-length.mmdb +1 -0
  27. data/test/data/bad-data/maxminddb-golang/metadata-is-an-uint128.mmdb +1 -0
  28. data/test/data/bad-data/maxminddb-golang/unexpected-bytes.mmdb +0 -0
  29. data/test/data/perltidyrc +12 -0
  30. data/test/data/source-data/GeoIP2-Anonymous-IP-Test.json +41 -0
  31. data/test/data/source-data/GeoIP2-City-Test.json +12852 -0
  32. data/test/data/source-data/GeoIP2-Connection-Type-Test.json +102 -0
  33. data/test/data/source-data/GeoIP2-Country-Test.json +11347 -0
  34. data/test/data/source-data/GeoIP2-DensityIncome-Test.json +14 -0
  35. data/test/data/source-data/GeoIP2-Domain-Test.json +452 -0
  36. data/test/data/source-data/GeoIP2-Enterprise-Test.json +673 -0
  37. data/test/data/source-data/GeoIP2-ISP-Test.json +12585 -0
  38. data/test/data/source-data/GeoIP2-Precision-Enterprise-Test.json +1598 -0
  39. data/test/data/source-data/GeoIP2-User-Count-Test.json +2824 -0
  40. data/test/data/source-data/GeoLite2-ASN-Test.json +37 -0
  41. data/test/data/source-data/README +15 -0
  42. data/test/data/test-data/GeoIP2-Anonymous-IP-Test.mmdb +0 -0
  43. data/test/data/test-data/GeoIP2-City-Test-Broken-Double-Format.mmdb +0 -0
  44. data/test/data/test-data/GeoIP2-City-Test-Invalid-Node-Count.mmdb +0 -0
  45. data/test/data/test-data/GeoIP2-City-Test.mmdb +0 -0
  46. data/test/data/test-data/GeoIP2-Connection-Type-Test.mmdb +0 -0
  47. data/test/data/test-data/GeoIP2-Country-Test.mmdb +0 -0
  48. data/test/data/test-data/GeoIP2-DensityIncome-Test.mmdb +0 -0
  49. data/test/data/test-data/GeoIP2-Domain-Test.mmdb +0 -0
  50. data/test/data/test-data/GeoIP2-Enterprise-Test.mmdb +0 -0
  51. data/test/data/test-data/GeoIP2-ISP-Test.mmdb +0 -0
  52. data/test/data/test-data/GeoIP2-Precision-Enterprise-Test.mmdb +0 -0
  53. data/test/data/test-data/GeoIP2-User-Count-Test.mmdb +0 -0
  54. data/test/data/test-data/GeoLite2-ASN-Test.mmdb +0 -0
  55. data/test/data/test-data/MaxMind-DB-no-ipv4-search-tree.mmdb +0 -0
  56. data/test/data/test-data/MaxMind-DB-string-value-entries.mmdb +0 -0
  57. data/test/data/test-data/MaxMind-DB-test-broken-pointers-24.mmdb +0 -0
  58. data/test/data/test-data/MaxMind-DB-test-broken-search-tree-24.mmdb +0 -0
  59. data/test/data/test-data/MaxMind-DB-test-decoder.mmdb +0 -0
  60. data/test/data/test-data/MaxMind-DB-test-ipv4-24.mmdb +0 -0
  61. data/test/data/test-data/MaxMind-DB-test-ipv4-28.mmdb +0 -0
  62. data/test/data/test-data/MaxMind-DB-test-ipv4-32.mmdb +0 -0
  63. data/test/data/test-data/MaxMind-DB-test-ipv6-24.mmdb +0 -0
  64. data/test/data/test-data/MaxMind-DB-test-ipv6-28.mmdb +0 -0
  65. data/test/data/test-data/MaxMind-DB-test-ipv6-32.mmdb +0 -0
  66. data/test/data/test-data/MaxMind-DB-test-metadata-pointers.mmdb +0 -0
  67. data/test/data/test-data/MaxMind-DB-test-mixed-24.mmdb +0 -0
  68. data/test/data/test-data/MaxMind-DB-test-mixed-28.mmdb +0 -0
  69. data/test/data/test-data/MaxMind-DB-test-mixed-32.mmdb +0 -0
  70. data/test/data/test-data/MaxMind-DB-test-nested.mmdb +0 -0
  71. data/test/data/test-data/README.md +26 -0
  72. data/test/data/test-data/maps-with-pointers.raw +0 -0
  73. data/test/data/test-data/write-test-data.pl +620 -0
  74. data/test/data/tidyall.ini +5 -0
  75. data/test/mmdb_util.rb +24 -0
  76. data/test/test_decoder.rb +241 -0
  77. data/test/test_reader.rb +415 -0
  78. metadata +126 -0
@@ -0,0 +1,232 @@
1
+ require 'maxmind/db/errors'
2
+
3
+ module MaxMind # :nodoc:
4
+ class DB
5
+ # +Decoder+ decodes a {MaxMind DB}[http://maxmind.github.io/MaxMind-DB/]
6
+ # data section.
7
+ #
8
+ # Typically you will interact with this class through a Reader rather than
9
+ # directly.
10
+ class Decoder # :nodoc:
11
+ # Create a +Decoder+.
12
+ #
13
+ # +io+ is the DB. It must provide a +read+ method. It must be opened in
14
+ # binary mode.
15
+ #
16
+ # +pointer_base+ is the base number to use when decoding a pointer. It is
17
+ # where the data section begins rather than the beginning of the file.
18
+ # The specification states the formula in the `Data Section Separator'
19
+ # section.
20
+ #
21
+ # +pointer_test+ is used for testing pointer code.
22
+ def initialize(io, pointer_base = 0, pointer_test = false)
23
+ @io = io
24
+ @pointer_base = pointer_base
25
+ @pointer_test = pointer_test
26
+ end
27
+
28
+ private
29
+
30
+ def decode_array(size, offset)
31
+ array = []
32
+ size.times do
33
+ value, offset = decode(offset)
34
+ array << value
35
+ end
36
+ [array, offset]
37
+ end
38
+
39
+ def decode_boolean(size, offset)
40
+ [size != 0, offset]
41
+ end
42
+
43
+ def decode_bytes(size, offset)
44
+ [@io.read(offset, size), offset + size]
45
+ end
46
+
47
+ def decode_double(size, offset)
48
+ verify_size(8, size)
49
+ buf = @io.read(offset, 8)
50
+ [buf.unpack('G'.freeze)[0], offset + 8]
51
+ end
52
+
53
+ def decode_float(size, offset)
54
+ verify_size(4, size)
55
+ buf = @io.read(offset, 4)
56
+ [buf.unpack('g'.freeze)[0], offset + 4]
57
+ end
58
+
59
+ def verify_size(expected, actual)
60
+ return if expected == actual
61
+ raise InvalidDatabaseError,
62
+ 'The MaxMind DB file\'s data section contains bad data (unknown data type or corrupt data)'.freeze
63
+ end
64
+
65
+ def decode_int32(size, offset)
66
+ decode_int('l>'.freeze, 4, size, offset)
67
+ end
68
+
69
+ def decode_uint16(size, offset)
70
+ decode_int('n'.freeze, 2, size, offset)
71
+ end
72
+
73
+ def decode_uint32(size, offset)
74
+ decode_int('N'.freeze, 4, size, offset)
75
+ end
76
+
77
+ def decode_uint64(size, offset)
78
+ decode_int('Q>'.freeze, 8, size, offset)
79
+ end
80
+
81
+ def decode_int(type_code, type_size, size, offset)
82
+ return 0, offset if size == 0
83
+
84
+ buf = @io.read(offset, size)
85
+ buf = buf.rjust(type_size, "\x00".freeze) if size != type_size
86
+ [buf.unpack(type_code)[0], offset + size]
87
+ end
88
+
89
+ def decode_uint128(size, offset)
90
+ return 0, offset if size == 0
91
+
92
+ buf = @io.read(offset, size)
93
+
94
+ if size <= 8
95
+ buf = buf.rjust(8, "\x00".freeze)
96
+ return buf.unpack('Q>'.freeze)[0], offset + size
97
+ end
98
+
99
+ a_bytes = buf[0...-8].rjust(8, "\x00".freeze)
100
+ b_bytes = buf[-8...buf.length]
101
+ a = a_bytes.unpack('Q>'.freeze)[0]
102
+ b = b_bytes.unpack('Q>'.freeze)[0]
103
+ a <<= 64
104
+ [a | b, offset + size]
105
+ end
106
+
107
+ def decode_map(size, offset)
108
+ container = {}
109
+ size.times do
110
+ key, offset = decode(offset)
111
+ value, offset = decode(offset)
112
+ container[key] = value
113
+ end
114
+ [container, offset]
115
+ end
116
+
117
+ def decode_pointer(size, offset)
118
+ pointer_size = size >> 3
119
+
120
+ case pointer_size
121
+ when 0
122
+ new_offset = offset + 1
123
+ buf = (size & 0x7).chr << @io.read(offset, 1)
124
+ pointer = buf.unpack('n'.freeze)[0] + @pointer_base
125
+ when 1
126
+ new_offset = offset + 2
127
+ buf = "\x00".freeze.b << (size & 0x7).chr << @io.read(offset, 2)
128
+ pointer = buf.unpack('N'.freeze)[0] + 2048 + @pointer_base
129
+ when 2
130
+ new_offset = offset + 3
131
+ buf = (size & 0x7).chr << @io.read(offset, 3)
132
+ pointer = buf.unpack('N'.freeze)[0] + 526_336 + @pointer_base
133
+ else
134
+ new_offset = offset + 4
135
+ buf = @io.read(offset, 4)
136
+ pointer = buf.unpack('N'.freeze)[0] + @pointer_base
137
+ end
138
+
139
+ return pointer, new_offset if @pointer_test
140
+
141
+ value, = decode(pointer)
142
+ [value, new_offset]
143
+ end
144
+
145
+ def decode_utf8_string(size, offset)
146
+ new_offset = offset + size
147
+ buf = @io.read(offset, size)
148
+ buf.force_encoding(Encoding::UTF_8)
149
+ # We could check it's valid UTF-8 with `valid_encoding?', but for
150
+ # performance I do not.
151
+ [buf, new_offset]
152
+ end
153
+
154
+ TYPE_DECODER = {
155
+ 1 => :decode_pointer,
156
+ 2 => :decode_utf8_string,
157
+ 3 => :decode_double,
158
+ 4 => :decode_bytes,
159
+ 5 => :decode_uint16,
160
+ 6 => :decode_uint32,
161
+ 7 => :decode_map,
162
+ 8 => :decode_int32,
163
+ 9 => :decode_uint64,
164
+ 10 => :decode_uint128,
165
+ 11 => :decode_array,
166
+ 14 => :decode_boolean,
167
+ 15 => :decode_float,
168
+ }.freeze
169
+ private_constant :TYPE_DECODER
170
+
171
+ public
172
+
173
+ # Decode a section of the data section starting at +offset+.
174
+ #
175
+ # +offset+ is the location of the data structure to decode.
176
+ #
177
+ # Returns an array where the first element is the decoded value and the
178
+ # second is the offset after decoding it.
179
+ #
180
+ # Throws an exception if there is an error.
181
+ def decode(offset)
182
+ new_offset = offset + 1
183
+ buf = @io.read(offset, 1)
184
+ ctrl_byte = buf.ord
185
+ type_num = ctrl_byte >> 5
186
+ type_num, new_offset = read_extended(new_offset) if type_num == 0
187
+
188
+ size, new_offset = size_from_ctrl_byte(ctrl_byte, new_offset, type_num)
189
+ # We could check an element exists at `type_num', but for performance I
190
+ # don't.
191
+ send(TYPE_DECODER[type_num], size, new_offset)
192
+ end
193
+
194
+ private
195
+
196
+ def read_extended(offset)
197
+ buf = @io.read(offset, 1)
198
+ next_byte = buf.ord
199
+ type_num = next_byte + 7
200
+ if type_num < 7
201
+ raise InvalidDatabaseError,
202
+ "Something went horribly wrong in the decoder. An extended type resolved to a type number < 8 (#{type_num})"
203
+ end
204
+ [type_num, offset + 1]
205
+ end
206
+
207
+ def size_from_ctrl_byte(ctrl_byte, offset, type_num)
208
+ size = ctrl_byte & 0x1f
209
+
210
+ return size, offset if type_num == 1
211
+
212
+ return size, offset if size < 29
213
+
214
+ if size == 29
215
+ size_bytes = @io.read(offset, 1)
216
+ size = 29 + size_bytes.ord
217
+ return size, offset + 1
218
+ end
219
+
220
+ if size == 30
221
+ size_bytes = @io.read(offset, 2)
222
+ size = 285 + size_bytes.unpack('n'.freeze)[0]
223
+ return size, offset + 2
224
+ end
225
+
226
+ size_bytes = "\x00".freeze.b << @io.read(offset, 3)
227
+ size = 65_821 + size_bytes.unpack('N'.freeze)[0]
228
+ [size, offset + 3]
229
+ end
230
+ end
231
+ end
232
+ end
@@ -0,0 +1,8 @@
1
module MaxMind # :nodoc:
  class DB
    # An InvalidDatabaseError means the {MaxMind
    # DB}[http://maxmind.github.io/MaxMind-DB/] file is corrupt or invalid.
    #
    # It is raised by the decoder and file reader when the data section
    # contains bad data or when a read cannot be satisfied.
    class InvalidDatabaseError < RuntimeError
    end
  end
end
@@ -0,0 +1,37 @@
1
+ require 'maxmind/db/errors'
2
+
3
module MaxMind # :nodoc:
  class DB
    # +FileReader+ reads a database from disk. Reads are safe to perform from
    # multiple threads: it uses +pread+ where available and otherwise
    # serializes seek+read pairs with a mutex.
    class FileReader # :nodoc:
      # Open the database at +filename+ in binary mode.
      def initialize(filename)
        @file = File.new(filename, 'rb'.freeze)
        @size = @file.size
        @lock = Mutex.new
      end

      # The size of the database file in bytes.
      attr_reader :size

      # Close the underlying file handle.
      def close
        @file.close
      end

      # Return +size+ bytes starting at +offset+.
      #
      # Raises InvalidDatabaseError if the file does not contain enough bytes
      # to satisfy the read.
      def read(offset, size)
        return ''.freeze.b if size == 0

        # When we support only Ruby 2.5+, remove this and require pread.
        buf =
          if @file.respond_to?(:pread)
            @file.pread(size, offset)
          else
            @lock.synchronize do
              @file.seek(offset, IO::SEEK_SET)
              @file.read(size)
            end
          end

        raise InvalidDatabaseError, 'The MaxMind DB file contains bad data'.freeze if buf.nil? || buf.length != size

        buf
      end
    end
  end
end
@@ -0,0 +1,24 @@
1
module MaxMind # :nodoc:
  class DB
    # +MemoryReader+ serves reads out of an in-memory buffer, either loaded
    # from a file on disk or supplied directly by the caller.
    class MemoryReader # :nodoc:
      # +filename+ is the path of the database to load into memory, unless
      # +options+ has +:is_buffer+ set to true, in which case +filename+ is
      # treated as the raw database contents itself.
      def initialize(filename, options = {})
        @buf =
          if options[:is_buffer]
            filename
          else
            File.read(filename, mode: 'rb'.freeze).freeze
          end
        @size = @buf.length
      end

      # The size of the database in bytes.
      attr_reader :size

      # A no-op, provided for interface compatibility with FileReader.
      def close; end

      # Return +size+ bytes starting at +offset+.
      def read(offset, size)
        @buf[offset, size]
      end
    end
  end
end
@@ -0,0 +1,61 @@
1
module MaxMind # :nodoc:
  class DB
    # Metadata holds metadata about a {MaxMind
    # DB}[http://maxmind.github.io/MaxMind-DB/] file.
    class Metadata
      # The number of nodes in the database.
      attr_reader :node_count

      # The bit size of a record in the search tree.
      attr_reader :record_size

      # The IP version of the data in the database. A value of 4 means the
      # database only supports IPv4. A database with a value of 6 may support
      # both IPv4 and IPv6 lookups.
      attr_reader :ip_version

      # A string identifying the database type. e.g., "GeoIP2-City".
      attr_reader :database_type

      # An array of locale codes supported by the database.
      attr_reader :languages

      # The major version number of the binary format used when creating the
      # database.
      attr_reader :binary_format_major_version

      # The minor version number of the binary format used when creating the
      # database.
      attr_reader :binary_format_minor_version

      # The Unix epoch for the build time of the database.
      attr_reader :build_epoch

      # A hash from locales to text descriptions of the database.
      attr_reader :description

      # +map+ is a hash representing the metadata map.
      def initialize(map)
        @node_count, @record_size, @ip_version, @database_type,
          @languages, @binary_format_major_version,
          @binary_format_minor_version, @build_epoch, @description =
          map.values_at(
            'node_count', 'record_size', 'ip_version', 'database_type',
            'languages', 'binary_format_major_version',
            'binary_format_minor_version', 'build_epoch', 'description'
          )
      end

      # The size of a node in bytes (a node is two records).
      def node_byte_size
        record_size / 4
      end

      # The size of the search tree in bytes.
      def search_tree_size
        node_count * node_byte_size
      end
    end
  end
end
@@ -0,0 +1,19 @@
1
# Gem specification for maxmind-db.
Gem::Specification.new do |s|
  # Identity.
  s.name = 'maxmind-db'
  s.version = '1.0.0.beta'
  s.summary = 'A gem for reading MaxMind DB files.'
  s.description = 'A gem for reading MaxMind DB files. MaxMind DB is a binary file format that stores data indexed by IP address subnets (IPv4 or IPv6).'

  # Ownership and licensing.
  s.authors = ['William Storey']
  s.email = 'wstorey@maxmind.com'
  s.homepage = 'https://github.com/maxmind/MaxMind-DB-Reader-ruby'
  s.licenses = ['Apache-2.0', 'MIT']

  # Package contents.
  s.files = Dir['**/*']

  # Links shown on the gem's RubyGems page.
  s.metadata = {
    'bug_tracker_uri' => 'https://github.com/maxmind/MaxMind-DB-Reader-ruby/issues',
    'changelog_uri' => 'https://github.com/maxmind/MaxMind-DB-Reader-ruby/blob/master/CHANGELOG.md',
    'documentation_uri' => 'https://github.com/maxmind/MaxMind-DB-Reader-ruby',
    'homepage_uri' => 'https://github.com/maxmind/MaxMind-DB-Reader-ruby',
    'source_code_uri' => 'https://github.com/maxmind/MaxMind-DB-Reader-ruby',
  }
end
@@ -0,0 +1,4 @@
1
+ This work is licensed under the Creative Commons Attribution-ShareAlike 3.0
2
+ Unported License. To view a copy of this license, visit
3
+ http://creativecommons.org/licenses/by-sa/3.0/ or send a letter to Creative
4
+ Commons, 444 Castro Street, Suite 900, Mountain View, California, 94041, USA.
@@ -0,0 +1,558 @@
1
+ ---
2
+ layout: default
3
+ title: MaxMind DB File Format Specification
4
+ version: v2.0
5
+ ---
6
+ # MaxMind DB File Format Specification
7
+
8
+ ## Description
9
+
10
+ The MaxMind DB file format is a database format that maps IPv4 and IPv6
11
+ addresses to data records using an efficient binary search tree.
12
+
13
+ ## Version
14
+
15
+ This spec documents **version 2.0** of the MaxMind DB binary format.
16
+
17
+ The version number consists of separate major and minor version numbers. It
18
+ should not be considered a decimal number. In other words, version 2.10 comes
19
+ after version 2.9.
20
+
21
+ Code which is capable of reading a given major version of the format should
22
+ not be broken by minor version changes to the format.
23
+
24
+ ## Overview
25
+
26
+ The binary database is split into three parts:
27
+
28
+ 1. The binary search tree. Each level of the tree corresponds to a single bit
29
+ in the 128 bit representation of an IPv6 address.
30
+ 2. The data section. These are the values returned to the client for a
31
+ specific IP address, e.g. "US", "New York", or a more complex map type made up
32
+ of multiple fields.
33
+ 3. Database metadata. Information about the database itself.
34
+
35
+ ## Database Metadata
36
+
37
+ This portion of the database is stored at the end of the file. It is
38
+ documented first because understanding some of the metadata is key to
39
+ understanding how the other sections work.
40
+
41
+ This section can be found by looking for a binary sequence matching
42
+ "\xab\xcd\xefMaxMind.com". The *last* occurrence of this string in the file
43
+ marks the end of the data section and the beginning of the metadata. Since we
44
+ allow for arbitrary binary data in the data section, some other piece of data
45
+ could contain these values. This is why you need to find the last occurrence
46
+ of this sequence.
47
+
48
+ The maximum allowable size for the metadata section, including the marker that
49
+ starts the metadata, is 128KiB.
50
+
51
+ The metadata is stored as a map data structure. This structure is described
52
+ later in the spec. Changing a key's data type or removing a key would
53
+ constitute a major version change for this spec.
54
+
55
+ Except where otherwise specified, each key listed is required for the database
56
+ to be considered valid.
57
+
58
+ Adding a key constitutes a minor version change. Removing a key or changing
59
+ its type constitutes a major version change.
60
+
61
+ The list of known keys for the current version of the format is as follows:
62
+
63
+ ### node\_count
64
+
65
+ This is an unsigned 32-bit integer indicating the number of nodes in the
66
+ search tree.
67
+
68
+ ### record\_size
69
+
70
+ This is an unsigned 16-bit integer. It indicates the number of bits in a
71
+ record in the search tree. Note that each node consists of *two* records.
72
+
73
+ ### ip\_version
74
+
75
+ This is an unsigned 16-bit integer which is always 4 or 6. It indicates
76
+ whether the database contains IPv4 or IPv6 address data.
77
+
78
+ ### database\_type
79
+
80
+ This is a string that indicates the structure of each data record associated
81
+ with an IP address. The actual definition of these structures is left up to
82
+ the database creator.
83
+
84
+ Names starting with "GeoIP" are reserved for use by MaxMind (and "GeoIP" is a
85
+ trademark anyway).
86
+
87
+ ### languages
88
+
89
+ An array of strings, each of which is a locale code. A given record may
90
+ contain data items that have been localized to some or all of these
91
+ locales. Records should not contain localized data for locales not included in
92
+ this array.
93
+
94
+ This is an optional key, as this may not be relevant for all types of data.
95
+
96
+ ### binary\_format\_major\_version
97
+
98
+ This is an unsigned 16-bit integer indicating the major version number for the
99
+ database's binary format.
100
+
101
+ ### binary\_format\_minor\_version
102
+
103
+ This is an unsigned 16-bit integer indicating the minor version number for the
104
+ database's binary format.
105
+
106
+ ### build\_epoch
107
+
108
+ This is an unsigned 64-bit integer that contains the database build timestamp
109
+ as a Unix epoch value.
110
+
111
+ ### description
112
+
113
+ This key will always point to a map. The keys of that map will be language
114
+ codes, and the values will be a description in that language as a UTF-8
115
+ string.
116
+
117
+ The codes may include additional information such as script or country
118
+ identifiers, like "zh-TW" or "mn-Cyrl-MN". The additional identifiers will be
119
+ separated by a dash character ("-").
120
+
121
+ This key is optional. However, creators of databases are strongly
122
+ encouraged to include a description in at least one language.
123
+
124
+ ### Calculating the Search Tree Section Size
125
+
126
+ The formula for calculating the search tree section size *in bytes* is as
127
+ follows:
128
+
129
+ ( ( $record_size * 2 ) / 8 ) * $number_of_nodes
130
+
131
+ The end of the search tree marks the beginning of the data section.
132
+
133
+ ## Binary Search Tree Section
134
+
135
+ The database file starts with a binary search tree. The number of nodes in the
136
+ tree is dependent on how many unique netblocks are needed for the particular
137
+ database. For example, the city database needs many more small netblocks than
138
+ the country database.
139
+
140
+ The top most node is always located at the beginning of the search tree
141
+ section's address space. The top node is node 0.
142
+
143
+ Each node consists of two records, each of which is a pointer to an address in
144
+ the file.
145
+
146
+ The pointers can point to one of three things. First, it may point to another
147
+ node in the search tree address space. These pointers are followed as part of
148
+ the IP address search algorithm, described below.
149
+
150
+ The pointer can point to a value equal to `$number_of_nodes`. If this is the
151
+ case, it means that the IP address we are searching for is not in the
152
+ database.
153
+
154
+ Finally, it may point to an address in the data section. This is the data
155
+ relevant to the given netblock.
156
+
157
+ ### Node Layout
158
+
159
+ Each node in the search tree consists of two records, each of which is a
160
+ pointer. The record size varies by database, but inside a single database node
161
+ records are always the same size. A record may be anywhere from 24 to 128 bits
162
+ long, depending on the number of nodes in the tree. These pointers are
163
+ stored in big-endian format (most significant byte first).
164
+
165
+ Here are some examples of how the records are laid out in a node for 24, 28,
166
+ and 32 bit records. Larger record sizes follow this same pattern.
167
+
168
+ #### 24 bits (small database), one node is 6 bytes
169
+
170
+ | <------------- node --------------->|
171
+ | 23 .. 0 | 23 .. 0 |
172
+
173
+ #### 28 bits (medium database), one node is 7 bytes
174
+
175
+ | <------------- node --------------->|
176
+ | 23 .. 0 | 27..24 | 27..24 | 23 .. 0 |
177
+
178
+ Note, the last 4 bits of each pointer are combined into the middle byte.
179
+
180
+ #### 32 bits (large database), one node is 8 bytes
181
+
182
+ | <------------- node --------------->|
183
+ | 31 .. 0 | 31 .. 0 |
184
+
185
+ ### Search Lookup Algorithm
186
+
187
+ The first step is to convert the IP address to its big-endian binary
188
+ representation. For an IPv4 address, this becomes 32 bits. For IPv6 you get
189
+ 128 bits.
190
+
191
+ The leftmost bit corresponds to the first node in the search tree. For each
192
+ bit, a value of 0 means we choose the left record in a node, and a value of 1
193
+ means we choose the right record.
194
+
195
+ The record value is always interpreted as an unsigned integer. The maximum
196
+ size of the integer is dependent on the number of bits in a record (24, 28, or
197
+ 32).
198
+
199
+ If the record value is a number that is less than the *number of nodes* (not
200
+ in bytes, but the actual node count) in the search tree (this is stored in the
201
+ database metadata), then the value is a node number. In this case, we find
202
+ that node in the search tree and repeat the lookup algorithm from there.
203
+
204
+ If the record value is equal to the number of nodes, that means that we do not
205
+ have any data for the IP address, and the search ends here.
206
+
207
+ If the record value is *greater* than the number of nodes in the search tree,
208
+ then it is an actual pointer value pointing into the data section. The value
209
+ of the pointer is calculated from the start of the data section, *not* from
210
+ the start of the file.
211
+
212
+ In order to determine where in the data section we should start looking, we use
213
+ the following formula:
214
+
215
+ $data_section_offset = ( $record_value - $node_count ) - 16
216
+
217
+ The `16` is the size of the data section separator (see below for details).
218
+
219
+ The reason that we subtract the `$node_count` is best demonstrated by an example.
220
+
221
+ Let's assume we have a 24-bit tree with 1,000 nodes. Each node contains 48
222
+ bits, or 6 bytes. The size of the tree is 6,000 bytes.
223
+
224
+ When a record in the tree contains a number that is less than 1,000, this
225
+ is a *node number*, and we look up that node. If a record contains a value
226
+ greater than or equal to 1,016, we know that it is a data section value. We
227
+ subtract the node count (1,000) and then subtract 16 for the data section
228
+ separator, giving us the number 0, the first byte of the data section.
229
+
230
+ If a record contained the value 6,000, this formula would give us an offset of
231
+ 4,984 into the data section.
232
+
233
+ In order to determine where in the file this offset really points to, we also
234
+ need to know where the data section starts. This can be calculated by
235
+ determining the size of the search tree in bytes and then adding an additional
236
+ 16 bytes for the data section separator.
237
+
238
+ So the final formula to determine the offset in the file is:
239
+
240
+ $offset_in_file = ( $record_value - $node_count )
241
+ + $search_tree_size_in_bytes
242
+
243
+ ### IPv4 addresses in an IPv6 tree
244
+
245
+ When storing IPv4 addresses in an IPv6 tree, they are stored as-is, so they
246
+ occupy the first 32-bits of the address space (from 0 to 2**32 - 1).
247
+
248
+ Creators of databases should decide on a strategy for handling the various
249
+ mappings between IPv4 and IPv6.
250
+
251
+ The strategy that MaxMind uses for its GeoIP databases is to include a pointer
252
+ from the `::ffff:0:0/96` subnet to the root node of the IPv4 address space in
253
+ the tree. This accounts for the
254
+ [IPv4-mapped IPv6 address](http://en.wikipedia.org/wiki/IPv6#IPv4-mapped_IPv6_addresses).
255
+
256
+ MaxMind also includes a pointer from the `2002::/16` subnet to the root node
257
+ of the IPv4 address space in the tree. This accounts for the
258
+ [6to4 mapping](http://en.wikipedia.org/wiki/6to4) subnet.
259
+
260
+ Database creators are encouraged to document whether they are doing something
261
+ similar for their databases.
262
+
263
+ The Teredo subnet cannot be accounted for in the tree. Instead, code that
264
+ searches the tree can offer to decode the IPv4 portion of a Teredo address and
265
+ look that up.
266
+
267
+ ## Data Section Separator
268
+
269
+ There are 16 bytes of NULLs in between the search tree and the data
270
+ section. This separator exists in order to make it possible for a verification
271
+ tool to distinguish between the two sections.
272
+
273
+ This separator is not considered part of the data section itself. In other
274
+ words, the data section starts at `$size_of_search_tree + 16` bytes in the
275
+ file.
276
+
277
+ ## Output Data Section
278
+
279
+ Each output data field has an associated type, and that type is encoded as a
280
+ number that begins the data field. Some types are variable length. In those
281
+ cases, the type indicator is also followed by a length. The data payload
282
+ always comes at the end of the field.
283
+
284
+ All binary data is stored in big-endian format.
285
+
286
+ Note that the *interpretation* of a given data type's meaning is decided by
287
+ higher-level APIs, not by the binary format itself.
288
+
289
+ ### pointer - 1
290
+
291
+ A pointer to another part of the data section's address space. The pointer
292
+ will point to the beginning of a field. It is illegal for a pointer to point
293
+ to another pointer.
294
+
295
+ Pointer values start from the beginning of the data section, *not* the
296
+ beginning of the file.
297
+
298
+ ### UTF-8 string - 2
299
+
300
+ A variable length byte sequence that contains valid UTF-8. If the length is
301
+ zero then this is an empty string.
302
+
303
+ ### double - 3
304
+
305
+ This is stored as an IEEE-754 double (binary64) in big-endian format. The
306
+ length of a double is always 8 bytes.
307
+
308
+ ### bytes - 4
309
+
310
+ A variable length byte sequence containing any sort of binary data. If the
311
+ length is zero then this is a zero-length byte sequence.
312
+
313
+ This is not currently used but may be used in the future to embed non-text
314
+ data (images, etc.).
315
+
316
+ ### integer formats
317
+
318
+ Integers are stored in variable length binary fields.
319
+
320
+ We support 16-bit, 32-bit, 64-bit, and 128-bit unsigned integers. We also
321
+ support 32-bit signed integers.
322
+
323
+ A 128-bit integer can use up to 16 bytes, but may use fewer. Similarly, a
324
+ 32-bit integer may use from 0-4 bytes. The number of bytes used is determined
325
+ by the length specifier in the control byte. See below for details.
326
+
327
+ A length of zero always indicates the number 0.
328
+
329
+ When storing a signed integer, the left-most bit is the sign. A 1 is negative
330
+ and a 0 is positive.
331
+
332
+ The type numbers for our integer types are:
333
+
334
+ * unsigned 16-bit int - 5
335
+ * unsigned 32-bit int - 6
336
+ * signed 32-bit int - 8
337
+ * unsigned 64-bit int - 9
338
+ * unsigned 128-bit int - 10
339
+
340
+ The unsigned 32-bit and 128-bit types may be used to store IPv4 and IPv6
341
+ addresses, respectively.
342
+
343
+ The signed 32-bit integers are stored using the 2's complement representation.
344
+
345
+ ### map - 7
346
+
347
+ A map data type contains a set of key/value pairs. Unlike other data types,
348
+ the length information for maps indicates how many key/value pairs it
349
+ contains, not its length in bytes. This size can be zero.
350
+
351
+ See below for the algorithm used to determine the number of pairs in the
352
+ hash. This algorithm is also used to determine the length of a field's
353
+ payload.
354
+
355
+ ### array - 11
356
+
357
+ An array type contains a set of ordered values. The length information for
358
+ arrays indicates how many values it contains, not its length in bytes. This
359
+ size can be zero.
360
+
361
+ This type uses the same algorithm as maps for determining the length of a
362
+ field's payload.
363
+
364
+ ### data cache container - 12
365
+
366
+ This is a special data type that marks a container used to cache repeated
367
+ data. For example, instead of repeating the string "United States" over and
368
+ over in the database, we store it in the cache container and use pointers
369
+ *into* this container instead.
370
+
371
+ Nothing in the database will ever contain a pointer to this field
372
+ itself. Instead, various fields will point into the container.
373
+
374
+ The primary reason for making this a separate data type versus simply inlining
375
+ the cached data is so that a database dumper tool can skip this cache when
376
+ dumping the data section. The cache contents will end up being dumped as
377
+ pointers into it are followed.
378
+
379
+ ### end marker - 13
380
+
381
+ The end marker marks the end of the data section. It is not strictly
382
+ necessary, but including this marker allows a data section deserializer to
383
+ process a stream of input, rather than having to find the end of the section
384
+ before beginning the deserialization.
385
+
386
+ This data type is not followed by a payload, and its size is always zero.
387
+
388
+ ### boolean - 14
389
+
390
+ A true or false value. The length information for a boolean type will always
391
+ be 0 or 1, indicating the value. There is no payload for this field.
392
+
393
+ ### float - 15
394
+
395
+ This is stored as an IEEE-754 float (binary32) in big-endian format. The
396
+ length of a float is always 4 bytes.
397
+
398
+ This type is provided primarily for completeness. Because of the way floating
399
+ point numbers are stored, this type can easily lose precision when serialized
400
+ and then deserialized. If this is an issue for you, consider using a double
401
+ instead.
402
+
403
+ ### Data Field Format
404
+
405
+ Each field starts with a control byte. This control byte provides information
406
+ about the field's data type and payload size.
407
+
408
+ The first three bits of the control byte tell you what type the field is. If
409
+ these bits are all 0, then this is an "extended" type, which means that the
410
+ *next* byte contains the actual type. Otherwise, the first three bits will
411
+ contain a number from 1 to 7, the actual type for the field.
412
+
413
+ We've tried to assign the most commonly used types as numbers 1-7 as an
414
+ optimization.
415
+
416
+ With an extended type, the type number in the second byte is the number
417
+ minus 7. In other words, an array (type 11) will be stored with a 0 for the
418
+ type in the first byte and a 4 in the second.
419
+
420
+ Here is an example of how the control byte may combine with the next byte to
421
+ tell us the type:
422
+
423
+ 001XXXXX pointer
424
+ 010XXXXX UTF-8 string
425
+ 110XXXXX unsigned 32-bit int (ASCII)
426
+ 000XXXXX 00000011 unsigned 128-bit int (binary)
427
+ 000XXXXX 00000100 array
428
+ 000XXXXX 00000110 end marker
429
+
430
+ #### Payload Size
431
+
432
+ The next five bits in the control byte tell you how long the data field's
433
+ payload is, except for maps and pointers. Maps and pointers use this size
434
+ information a bit differently. See below.
435
+
436
+ If the five bits are smaller than 29, then those bits are the payload size in
437
+ bytes. For example:
438
+
439
+ 01000010 UTF-8 string - 2 bytes long
440
+ 01011100 UTF-8 string - 28 bytes long
441
+ 11000001 unsigned 32-bit int - 1 byte long
442
+ 00000011 00000011 unsigned 128-bit int - 3 bytes long
443
+
444
+ If the five bits are equal to 29, 30, or 31, then use the following algorithm
445
+ to calculate the payload size.
446
+
447
+ If the value is 29, then the size is 29 + *the next byte after the type
448
+ specifying bytes as an unsigned integer*.
449
+
450
+ If the value is 30, then the size is 285 + *the next two bytes after the type
451
+ specifying bytes as a single unsigned integer*.
452
+
453
+ If the value is 31, then the size is 65,821 + *the next three bytes after the
454
+ type specifying bytes as a single unsigned integer*.
455
+
456
+ Some examples:
457
+
458
+ 01011101 00110011 UTF-8 string - 80 bytes long
459
+
460
+ In this case, the last five bits of the control byte equal 29. We treat the
461
+ next byte as an unsigned integer. The next byte is 51, so the total size is
462
+ (29 + 51) = 80.
463
+
464
+ 01011110 00110011 00110011 UTF-8 string - 13,392 bytes long
465
+
466
+ The last five bits of the control byte equal 30. We treat the next two bytes
467
+ as a single unsigned integer. The next two bytes equal 13,107, so the total
468
+ size is (285 + 13,107) = 13,392.
469
+
470
+ 01011111 00110011 00110011 00110011 UTF-8 string - 3,421,264 bytes long
471
+
472
+ The last five bits of the control byte equal 31. We treat the next three bytes
473
+ as a single unsigned integer. The next three bytes equal 3,355,443, so the
474
+ total size is (65,821 + 3,355,443) = 3,421,264.
475
+
476
+ This means that the maximum payload size for a single field is 16,843,036
477
+ bytes.
478
+
479
+ The binary number types always have a known size, but for consistency's sake,
480
+ the control byte will always specify the correct size for these types.
481
+
482
+ #### Maps
483
+
484
+ Maps use the size in the control byte (and any following bytes) to indicate
485
+ the number of key/value pairs in the map, not the size of the payload in
486
+ bytes.
487
+
488
+ This means that the maximum number of pairs for a single map is 16,843,036.
489
+
490
+ Maps are laid out with each key followed by its value, followed by the next
491
+ pair, etc.
492
+
493
+ The keys are **always** UTF-8 strings. The values may be any data type,
494
+ including maps or pointers.
495
+
496
+ Once we know the number of pairs, we can look at each pair in turn to
497
+ determine the size of the key and the key name, as well as the value's type
498
+ and payload.
499
+
500
+ #### Pointers
501
+
502
+ Pointers use the last five bits in the control byte to calculate the pointer
503
+ value.
504
+
505
+ To calculate the pointer value, we start by subdividing the five bits into two
506
+ groups. The first two bits indicate the size, and the next three bits are part
507
+ of the value, so we end up with a control byte breaking down like this:
508
+ 001SSVVV.
509
+
510
+ The size can be 0, 1, 2, or 3.
511
+
512
+ If the size is 0, the pointer is built by appending the next byte to the last
513
+ three bits to produce an 11-bit value.
514
+
515
+ If the size is 1, the pointer is built by appending the next two bytes to the
516
+ last three bits to produce a 19-bit value + 2048.
517
+
518
+ If the size is 2, the pointer is built by appending the next three bytes to the
519
+ last three bits to produce a 27-bit value + 526336.
520
+
521
+ Finally, if the size is 3, the pointer's value is contained in the next four
522
+ bytes as a 32-bit value. In this case, the last three bits of the control byte
523
+ are ignored.
524
+
525
+ This means that we are limited to 4GB of address space for pointers, so the
526
+ data section size for the database is limited to 4GB.
527
+
528
+ ## Reference Implementations
529
+
530
+ ### Writer
531
+
532
+ * [Perl](https://github.com/maxmind/MaxMind-DB-Writer-perl)
533
+
534
+ ### Reader
535
+
536
+ * [C](https://github.com/maxmind/libmaxminddb)
537
+ * [C#](https://github.com/maxmind/MaxMind-DB-Reader-dotnet)
538
+ * [Java](https://github.com/maxmind/MaxMind-DB-Reader-java)
539
+ * [Perl](https://github.com/maxmind/MaxMind-DB-Reader-perl)
540
+ * [PHP](https://github.com/maxmind/MaxMind-DB-Reader-php)
541
+ * [Python](https://github.com/maxmind/MaxMind-DB-Reader-python)
542
+
543
+ ## Authors
544
+
545
+ This specification was created by the following authors:
546
+
547
+ * Greg Oschwald \<goschwald@maxmind.com\>
548
+ * Dave Rolsky \<drolsky@maxmind.com\>
549
+ * Boris Zentner \<bzentner@maxmind.com\>
550
+
551
+ ## License
552
+
553
+ This work is licensed under the Creative Commons Attribution-ShareAlike 3.0
554
+ Unported License. To view a copy of this license, visit
555
+ [http://creativecommons.org/licenses/by-sa/3.0/](http://creativecommons.org/licenses/by-sa/3.0/)
556
+ or send a letter to Creative Commons, 444 Castro Street, Suite 900, Mountain
557
+ View, California, 94041, USA
558
+