geoip2_c 0.3.3 → 0.3.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (107) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/ubuntu.yml +33 -0
  3. data/.github/workflows/windows.yml +52 -0
  4. data/README.md +3 -9
  5. data/docker-compose.yml +7 -0
  6. data/dockerfiles/Dockerfile-ruby2.7 +8 -0
  7. data/ext/geoip2/extconf.rb +11 -4
  8. data/ext/geoip2/geoip2.c +58 -15
  9. data/ext/geoip2/libmaxminddb/t/libtap/.gitignore +13 -0
  10. data/ext/geoip2/libmaxminddb/t/libtap/.travis.yml +13 -0
  11. data/ext/geoip2/libmaxminddb/t/libtap/COPYING +165 -0
  12. data/ext/geoip2/libmaxminddb/t/libtap/INSTALL +41 -0
  13. data/ext/geoip2/libmaxminddb/t/libtap/Makefile +72 -0
  14. data/ext/geoip2/libmaxminddb/t/libtap/Makefile.win +37 -0
  15. data/ext/geoip2/libmaxminddb/t/libtap/README.md +268 -0
  16. data/ext/geoip2/libmaxminddb/t/libtap/t/cmp_mem.c +20 -0
  17. data/ext/geoip2/libmaxminddb/t/libtap/t/cmp_mem.expected +28 -0
  18. data/ext/geoip2/libmaxminddb/t/libtap/t/cmpok.c +16 -0
  19. data/ext/geoip2/libmaxminddb/t/libtap/t/cmpok.expected +37 -0
  20. data/ext/geoip2/libmaxminddb/t/libtap/t/diag.c +10 -0
  21. data/ext/geoip2/libmaxminddb/t/libtap/t/diag.expected +2 -0
  22. data/ext/geoip2/libmaxminddb/t/libtap/t/diesok.c +14 -0
  23. data/ext/geoip2/libmaxminddb/t/libtap/t/diesok.expected +6 -0
  24. data/ext/geoip2/libmaxminddb/t/libtap/t/is.c +24 -0
  25. data/ext/geoip2/libmaxminddb/t/libtap/t/is.expected +58 -0
  26. data/ext/geoip2/libmaxminddb/t/libtap/t/like.c +10 -0
  27. data/ext/geoip2/libmaxminddb/t/libtap/t/like.expected +4 -0
  28. data/ext/geoip2/libmaxminddb/t/libtap/t/simple.c +31 -0
  29. data/ext/geoip2/libmaxminddb/t/libtap/t/simple.expected +32 -0
  30. data/ext/geoip2/libmaxminddb/t/libtap/t/skip.c +23 -0
  31. data/ext/geoip2/libmaxminddb/t/libtap/t/skip.expected +9 -0
  32. data/ext/geoip2/libmaxminddb/t/libtap/t/synopsis.c +13 -0
  33. data/ext/geoip2/libmaxminddb/t/libtap/t/synopsis.expected +9 -0
  34. data/ext/geoip2/libmaxminddb/t/libtap/t/test.c +28 -0
  35. data/ext/geoip2/libmaxminddb/t/libtap/t/todo.c +17 -0
  36. data/ext/geoip2/libmaxminddb/t/libtap/t/todo.expected +11 -0
  37. data/ext/geoip2/libmaxminddb/t/libtap/tap.c +354 -0
  38. data/ext/geoip2/libmaxminddb/t/libtap/tap.h +115 -0
  39. data/ext/geoip2/libmaxminddb/t/maxmind-db/.gitattributes +1 -0
  40. data/ext/geoip2/libmaxminddb/t/maxmind-db/.gitconfig +2 -0
  41. data/ext/geoip2/libmaxminddb/t/maxmind-db/.gitignore +2 -0
  42. data/ext/geoip2/libmaxminddb/t/maxmind-db/.perltidyallrc +11 -0
  43. data/ext/geoip2/libmaxminddb/t/maxmind-db/.tidyallrc +7 -0
  44. data/ext/geoip2/libmaxminddb/t/maxmind-db/LICENSE +4 -0
  45. data/ext/geoip2/libmaxminddb/t/maxmind-db/MaxMind-DB-spec.md +558 -0
  46. data/ext/geoip2/libmaxminddb/t/maxmind-db/README.md +4 -0
  47. data/ext/geoip2/libmaxminddb/t/maxmind-db/bad-data/README.md +7 -0
  48. data/ext/geoip2/libmaxminddb/t/maxmind-db/bad-data/libmaxminddb/libmaxminddb-offset-integer-overflow.mmdb +0 -0
  49. data/ext/geoip2/libmaxminddb/t/maxmind-db/bad-data/maxminddb-golang/cyclic-data-structure.mmdb +0 -0
  50. data/ext/geoip2/libmaxminddb/t/maxmind-db/bad-data/maxminddb-golang/invalid-bytes-length.mmdb +1 -0
  51. data/ext/geoip2/libmaxminddb/t/maxmind-db/bad-data/maxminddb-golang/invalid-data-record-offset.mmdb +0 -0
  52. data/ext/geoip2/libmaxminddb/t/maxmind-db/bad-data/maxminddb-golang/invalid-map-key-length.mmdb +0 -0
  53. data/ext/geoip2/libmaxminddb/t/maxmind-db/bad-data/maxminddb-golang/invalid-string-length.mmdb +1 -0
  54. data/ext/geoip2/libmaxminddb/t/maxmind-db/bad-data/maxminddb-golang/metadata-is-an-uint128.mmdb +1 -0
  55. data/ext/geoip2/libmaxminddb/t/maxmind-db/bad-data/maxminddb-golang/unexpected-bytes.mmdb +0 -0
  56. data/ext/geoip2/libmaxminddb/t/maxmind-db/perltidyrc +12 -0
  57. data/ext/geoip2/libmaxminddb/t/maxmind-db/source-data/GeoIP2-Anonymous-IP-Test.json +32 -0
  58. data/ext/geoip2/libmaxminddb/t/maxmind-db/source-data/GeoIP2-City-Test.json +12616 -0
  59. data/ext/geoip2/libmaxminddb/t/maxmind-db/source-data/GeoIP2-Connection-Type-Test.json +102 -0
  60. data/ext/geoip2/libmaxminddb/t/maxmind-db/source-data/GeoIP2-Country-Test.json +10975 -0
  61. data/ext/geoip2/libmaxminddb/t/maxmind-db/source-data/GeoIP2-DensityIncome-Test.json +14 -0
  62. data/ext/geoip2/libmaxminddb/t/maxmind-db/source-data/GeoIP2-Domain-Test.json +452 -0
  63. data/ext/geoip2/libmaxminddb/t/maxmind-db/source-data/GeoIP2-Enterprise-Test.json +666 -0
  64. data/ext/geoip2/libmaxminddb/t/maxmind-db/source-data/GeoIP2-ISP-Test.json +12585 -0
  65. data/ext/geoip2/libmaxminddb/t/maxmind-db/source-data/GeoIP2-Precision-Enterprise-Test.json +1035 -0
  66. data/ext/geoip2/libmaxminddb/t/maxmind-db/source-data/GeoLite2-ASN-Test.json +37 -0
  67. data/ext/geoip2/libmaxminddb/t/maxmind-db/source-data/README +13 -0
  68. data/ext/geoip2/libmaxminddb/t/maxmind-db/test-data/GeoIP2-Anonymous-IP-Test.mmdb +0 -0
  69. data/ext/geoip2/libmaxminddb/t/maxmind-db/test-data/GeoIP2-City-Test-Broken-Double-Format.mmdb +0 -0
  70. data/ext/geoip2/libmaxminddb/t/maxmind-db/test-data/GeoIP2-City-Test-Invalid-Node-Count.mmdb +0 -0
  71. data/ext/geoip2/libmaxminddb/t/maxmind-db/test-data/GeoIP2-City-Test.mmdb +0 -0
  72. data/ext/geoip2/libmaxminddb/t/maxmind-db/test-data/GeoIP2-Connection-Type-Test.mmdb +0 -0
  73. data/ext/geoip2/libmaxminddb/t/maxmind-db/test-data/GeoIP2-Country-Test.mmdb +0 -0
  74. data/ext/geoip2/libmaxminddb/t/maxmind-db/test-data/GeoIP2-DensityIncome-Test.mmdb +0 -0
  75. data/ext/geoip2/libmaxminddb/t/maxmind-db/test-data/GeoIP2-Domain-Test.mmdb +0 -0
  76. data/ext/geoip2/libmaxminddb/t/maxmind-db/test-data/GeoIP2-Enterprise-Test.mmdb +0 -0
  77. data/ext/geoip2/libmaxminddb/t/maxmind-db/test-data/GeoIP2-ISP-Test.mmdb +0 -0
  78. data/ext/geoip2/libmaxminddb/t/maxmind-db/test-data/GeoIP2-Precision-Enterprise-Test.mmdb +0 -0
  79. data/ext/geoip2/libmaxminddb/t/maxmind-db/test-data/GeoLite2-ASN-Test.mmdb +0 -0
  80. data/ext/geoip2/libmaxminddb/t/maxmind-db/test-data/MaxMind-DB-no-ipv4-search-tree.mmdb +0 -0
  81. data/ext/geoip2/libmaxminddb/t/maxmind-db/test-data/MaxMind-DB-string-value-entries.mmdb +0 -0
  82. data/ext/geoip2/libmaxminddb/t/maxmind-db/test-data/MaxMind-DB-test-broken-pointers-24.mmdb +0 -0
  83. data/ext/geoip2/libmaxminddb/t/maxmind-db/test-data/MaxMind-DB-test-broken-search-tree-24.mmdb +0 -0
  84. data/ext/geoip2/libmaxminddb/t/maxmind-db/test-data/MaxMind-DB-test-decoder.mmdb +0 -0
  85. data/ext/geoip2/libmaxminddb/t/maxmind-db/test-data/MaxMind-DB-test-ipv4-24.mmdb +0 -0
  86. data/ext/geoip2/libmaxminddb/t/maxmind-db/test-data/MaxMind-DB-test-ipv4-28.mmdb +0 -0
  87. data/ext/geoip2/libmaxminddb/t/maxmind-db/test-data/MaxMind-DB-test-ipv4-32.mmdb +0 -0
  88. data/ext/geoip2/libmaxminddb/t/maxmind-db/test-data/MaxMind-DB-test-ipv6-24.mmdb +0 -0
  89. data/ext/geoip2/libmaxminddb/t/maxmind-db/test-data/MaxMind-DB-test-ipv6-28.mmdb +0 -0
  90. data/ext/geoip2/libmaxminddb/t/maxmind-db/test-data/MaxMind-DB-test-ipv6-32.mmdb +0 -0
  91. data/ext/geoip2/libmaxminddb/t/maxmind-db/test-data/MaxMind-DB-test-metadata-pointers.mmdb +0 -0
  92. data/ext/geoip2/libmaxminddb/t/maxmind-db/test-data/MaxMind-DB-test-mixed-24.mmdb +0 -0
  93. data/ext/geoip2/libmaxminddb/t/maxmind-db/test-data/MaxMind-DB-test-mixed-28.mmdb +0 -0
  94. data/ext/geoip2/libmaxminddb/t/maxmind-db/test-data/MaxMind-DB-test-mixed-32.mmdb +0 -0
  95. data/ext/geoip2/libmaxminddb/t/maxmind-db/test-data/MaxMind-DB-test-nested.mmdb +0 -0
  96. data/ext/geoip2/libmaxminddb/t/maxmind-db/test-data/README.md +26 -0
  97. data/ext/geoip2/libmaxminddb/t/maxmind-db/test-data/maps-with-pointers.raw +0 -0
  98. data/ext/geoip2/libmaxminddb/t/maxmind-db/test-data/write-test-data.pl +614 -0
  99. data/ext/geoip2/libmaxminddb/t/maxmind-db/tidyall.ini +5 -0
  100. data/geoip2_c.gemspec +2 -3
  101. data/lib/geoip2/database.rb +4 -0
  102. data/lib/geoip2/version.rb +1 -1
  103. metadata +108 -18
  104. data/.travis.yml +0 -30
  105. data/Appraisals +0 -7
  106. data/gemfiles/ruby_2.1.gemfile +0 -7
  107. data/gemfiles/ruby_2.2.gemfile +0 -7
@@ -0,0 +1,558 @@
1
+ ---
2
+ layout: default
3
+ title: MaxMind DB File Format Specification
4
+ version: v2.0
5
+ ---
6
+ # MaxMind DB File Format Specification
7
+
8
+ ## Description
9
+
10
+ The MaxMind DB file format is a database format that maps IPv4 and IPv6
11
+ addresses to data records using an efficient binary search tree.
12
+
13
+ ## Version
14
+
15
+ This spec documents **version 2.0** of the MaxMind DB binary format.
16
+
17
+ The version number consists of separate major and minor version numbers. It
18
+ should not be considered a decimal number. In other words, version 2.10 comes
19
+ after version 2.9.
20
+
21
+ Code which is capable of reading a given major version of the format should
22
+ not be broken by minor version changes to the format.
23
+
24
+ ## Overview
25
+
26
+ The binary database is split into three parts:
27
+
28
+ 1. The binary search tree. Each level of the tree corresponds to a single bit
29
+ in the 128 bit representation of an IPv6 address.
30
+ 2. The data section. These are the values returned to the client for a
31
+ specific IP address, e.g. "US", "New York", or a more complex map type made up
32
+ of multiple fields.
33
+ 3. Database metadata. Information about the database itself.
34
+
35
+ ## Database Metadata
36
+
37
+ This portion of the database is stored at the end of the file. It is
38
+ documented first because understanding some of the metadata is key to
39
+ understanding how the other sections work.
40
+
41
+ This section can be found by looking for a binary sequence matching
42
+ "\xab\xcd\xefMaxMind.com". The *last* occurrence of this string in the file
43
+ marks the end of the data section and the beginning of the metadata. Since we
44
+ allow for arbitrary binary data in the data section, some other piece of data
45
+ could contain these values. This is why you need to find the last occurrence
46
+ of this sequence.
47
+
48
+ The maximum allowable size for the metadata section, including the marker that
49
+ starts the metadata, is 128KiB.
50
+
51
+ The metadata is stored as a map data structure. This structure is described
52
+ later in the spec. Changing a key's data type or removing a key would
53
+ constitute a major version change for this spec.
54
+
55
+ Except where otherwise specified, each key listed is required for the database
56
+ to be considered valid.
57
+
58
+ Adding a key constitutes a minor version change. Removing a key or changing
59
+ its type constitutes a major version change.
60
+
61
+ The list of known keys for the current version of the format is as follows:
62
+
63
+ ### node\_count
64
+
65
+ This is an unsigned 32-bit integer indicating the number of nodes in the
66
+ search tree.
67
+
68
+ ### record\_size
69
+
70
+ This is an unsigned 16-bit integer. It indicates the number of bits in a
71
+ record in the search tree. Note that each node consists of *two* records.
72
+
73
+ ### ip\_version
74
+
75
+ This is an unsigned 16-bit integer which is always 4 or 6. It indicates
76
+ whether the database contains IPv4 or IPv6 address data.
77
+
78
+ ### database\_type
79
+
80
+ This is a string that indicates the structure of each data record associated
81
+ with an IP address. The actual definition of these structures is left up to
82
+ the database creator.
83
+
84
+ Names starting with "GeoIP" are reserved for use by MaxMind (and "GeoIP" is a
85
+ trademark anyway).
86
+
87
+ ### languages
88
+
89
+ An array of strings, each of which is a locale code. A given record may
90
+ contain data items that have been localized to some or all of these
91
+ locales. Records should not contain localized data for locales not included in
92
+ this array.
93
+
94
+ This is an optional key, as this may not be relevant for all types of data.
95
+
96
+ ### binary\_format\_major\_version
97
+
98
+ This is an unsigned 16-bit integer indicating the major version number for the
99
+ database's binary format.
100
+
101
+ ### binary\_format\_minor\_version
102
+
103
+ This is an unsigned 16-bit integer indicating the minor version number for the
104
+ database's binary format.
105
+
106
+ ### build\_epoch
107
+
108
+ This is an unsigned 64-bit integer that contains the database build timestamp
109
+ as a Unix epoch value.
110
+
111
+ ### description
112
+
113
+ This key will always point to a map. The keys of that map will be language
114
+ codes, and the values will be a description in that language as a UTF-8
115
+ string.
116
+
117
+ The codes may include additional information such as script or country
118
+ identifiers, like "zh-TW" or "mn-Cyrl-MN". The additional identifiers will be
119
+ separated by a dash character ("-").
120
+
121
+ This key is optional. However, creators of databases are strongly
122
+ encouraged to include a description in at least one language.
123
+
124
+ ### Calculating the Search Tree Section Size
125
+
126
+ The formula for calculating the search tree section size *in bytes* is as
127
+ follows:
128
+
129
+ ( ( $record_size * 2 ) / 8 ) * $number_of_nodes
130
+
131
+ The end of the search tree marks the beginning of the data section.
132
+
133
+ ## Binary Search Tree Section
134
+
135
+ The database file starts with a binary search tree. The number of nodes in the
136
+ tree is dependent on how many unique netblocks are needed for the particular
137
+ database. For example, the city database needs many more small netblocks than
138
+ the country database.
139
+
140
+ The top most node is always located at the beginning of the search tree
141
+ section's address space. The top node is node 0.
142
+
143
+ Each node consists of two records, each of which is a pointer to an address in
144
+ the file.
145
+
146
+ The pointers can point to one of three things. First, it may point to another
147
+ node in the search tree address space. These pointers are followed as part of
148
+ the IP address search algorithm, described below.
149
+
150
+ The pointer can point to a value equal to `$number_of_nodes`. If this is the
151
+ case, it means that the IP address we are searching for is not in the
152
+ database.
153
+
154
+ Finally, it may point to an address in the data section. This is the data
155
+ relevant to the given netblock.
156
+
157
+ ### Node Layout
158
+
159
+ Each node in the search tree consists of two records, each of which is a
160
+ pointer. The record size varies by database, but inside a single database node
161
+ records are always the same size. A record may be anywhere from 24 to 128 bits
162
+ long, depending on the number of nodes in the tree. These pointers are
163
+ stored in big-endian format (most significant byte first).
164
+
165
+ Here are some examples of how the records are laid out in a node for 24, 28,
166
+ and 32 bit records. Larger record sizes follow this same pattern.
167
+
168
+ #### 24 bits (small database), one node is 6 bytes
169
+
170
+ | <------------- node --------------->|
171
+ | 23 .. 0 | 23 .. 0 |
172
+
173
+ #### 28 bits (medium database), one node is 7 bytes
174
+
175
+ | <------------- node --------------->|
176
+ | 23 .. 0 | 27..24 | 27..24 | 23 .. 0 |
177
+
178
+ Note, the last 4 bits of each pointer are combined into the middle byte.
179
+
180
+ #### 32 bits (large database), one node is 8 bytes
181
+
182
+ | <------------- node --------------->|
183
+ | 31 .. 0 | 31 .. 0 |
184
+
185
+ ### Search Lookup Algorithm
186
+
187
+ The first step is to convert the IP address to its big-endian binary
188
+ representation. For an IPv4 address, this becomes 32 bits. For IPv6 you get
189
+ 128 bits.
190
+
191
+ The leftmost bit corresponds to the first node in the search tree. For each
192
+ bit, a value of 0 means we choose the left record in a node, and a value of 1
193
+ means we choose the right record.
194
+
195
+ The record value is always interpreted as an unsigned integer. The maximum
196
+ size of the integer is dependent on the number of bits in a record (24, 28, or
197
+ 32).
198
+
199
+ If the record value is a number that is less than the *number of nodes* (not
200
+ in bytes, but the actual node count) in the search tree (this is stored in the
201
+ database metadata), then the value is a node number. In this case, we find
202
+ that node in the search tree and repeat the lookup algorithm from there.
203
+
204
+ If the record value is equal to the number of nodes, that means that we do not
205
+ have any data for the IP address, and the search ends here.
206
+
207
+ If the record value is *greater* than the number of nodes in the search tree,
208
+ then it is an actual pointer value pointing into the data section. The value
209
+ of the pointer is calculated from the start of the data section, *not* from
210
+ the start of the file.
211
+
212
+ In order to determine where in the data section we should start looking, we use
213
+ the following formula:
214
+
215
+ $data_section_offset = ( $record_value - $node_count ) - 16
216
+
217
+ The `16` is the size of the data section separator (see below for details).
218
+
219
+ The reason that we subtract the `$node_count` is best demonstrated by an example.
220
+
221
+ Let's assume we have a 24-bit tree with 1,000 nodes. Each node contains 48
222
+ bits, or 6 bytes. The size of the tree is 6,000 bytes.
223
+
224
+ When a record in the tree contains a number that is < 1,000, this is a *node
225
+ number*, and we look up that node. If a record contains a value >= 1,016, we
226
+ know that it is a data section value. We subtract the node count (1,000) and
227
+ then subtract 16 for the data section separator, giving us the number 0, the
228
+ first byte of the data section.
229
+
230
+ If a record contained the value 6,000, this formula would give us an offset of
231
+ 4,984 into the data section.
232
+
233
+ In order to determine where in the file this offset really points to, we also
234
+ need to know where the data section starts. This can be calculated by
235
+ determining the size of the search tree in bytes and then adding an additional
236
+ 16 bytes for the data section separator.
237
+
238
+ So the final formula to determine the offset in the file is:
239
+
240
+ $offset_in_file = ( $record_value - $node_count )
241
+ + $search_tree_size_in_bytes
242
+
243
+ ### IPv4 addresses in an IPv6 tree
244
+
245
+ When storing IPv4 addresses in an IPv6 tree, they are stored as-is, so they
246
+ occupy the first 32-bits of the address space (from 0 to 2**32 - 1).
247
+
248
+ Creators of databases should decide on a strategy for handling the various
249
+ mappings between IPv4 and IPv6.
250
+
251
+ The strategy that MaxMind uses for its GeoIP databases is to include a pointer
252
+ from the `::ffff:0:0/96` subnet to the root node of the IPv4 address space in
253
+ the tree. This accounts for the
254
+ [IPv4-mapped IPv6 address](http://en.wikipedia.org/wiki/IPv6#IPv4-mapped_IPv6_addresses).
255
+
256
+ MaxMind also includes a pointer from the `2002::/16` subnet to the root node
257
+ of the IPv4 address space in the tree. This accounts for the
258
+ [6to4 mapping](http://en.wikipedia.org/wiki/6to4) subnet.
259
+
260
+ Database creators are encouraged to document whether they are doing something
261
+ similar for their databases.
262
+
263
+ The Teredo subnet cannot be accounted for in the tree. Instead, code that
264
+ searches the tree can offer to decode the IPv4 portion of a Teredo address and
265
+ look that up.
266
+
267
+ ## Data Section Separator
268
+
269
+ There are 16 bytes of NULLs in between the search tree and the data
270
+ section. This separator exists in order to make it possible for a verification
271
+ tool to distinguish between the two sections.
272
+
273
+ This separator is not considered part of the data section itself. In other
274
+ words, the data section starts at `$size_of_search_tree + 16` bytes in the
275
+ file.
276
+
277
+ ## Output Data Section
278
+
279
+ Each output data field has an associated type, and that type is encoded as a
280
+ number that begins the data field. Some types are variable length. In those
281
+ cases, the type indicator is also followed by a length. The data payload
282
+ always comes at the end of the field.
283
+
284
+ All binary data is stored in big-endian format.
285
+
286
+ Note that the *interpretation* of a given data type's meaning is decided by
287
+ higher-level APIs, not by the binary format itself.
288
+
289
+ ### pointer - 1
290
+
291
+ A pointer to another part of the data section's address space. The pointer
292
+ will point to the beginning of a field. It is illegal for a pointer to point
293
+ to another pointer.
294
+
295
+ Pointer values start from the beginning of the data section, *not* the
296
+ beginning of the file.
297
+
298
+ ### UTF-8 string - 2
299
+
300
+ A variable length byte sequence that contains valid UTF-8. If the length is
301
+ zero then this is an empty string.
302
+
303
+ ### double - 3
304
+
305
+ This is stored as an IEEE-754 double (binary64) in big-endian format. The
306
+ length of a double is always 8 bytes.
307
+
308
+ ### bytes - 4
309
+
310
+ A variable length byte sequence containing any sort of binary data. If the
311
+ length is zero then this is a zero-length byte sequence.
312
+
313
+ This is not currently used but may be used in the future to embed non-text
314
+ data (images, etc.).
315
+
316
+ ### integer formats
317
+
318
+ Integers are stored in variable length binary fields.
319
+
320
+ We support 16-bit, 32-bit, 64-bit, and 128-bit unsigned integers. We also
321
+ support 32-bit signed integers.
322
+
323
+ A 128-bit integer can use up to 16 bytes, but may use fewer. Similarly, a
324
+ 32-bit integer may use from 0-4 bytes. The number of bytes used is determined
325
+ by the length specifier in the control byte. See below for details.
326
+
327
+ A length of zero always indicates the number 0.
328
+
329
+ When storing a signed integer, the left-most bit is the sign. A 1 is negative
330
+ and a 0 is positive.
331
+
332
+ The type numbers for our integer types are:
333
+
334
+ * unsigned 16-bit int - 5
335
+ * unsigned 32-bit int - 6
336
+ * signed 32-bit int - 8
337
+ * unsigned 64-bit int - 9
338
+ * unsigned 128-bit int - 10
339
+
340
+ The unsigned 32-bit and 128-bit types may be used to store IPv4 and IPv6
341
+ addresses, respectively.
342
+
343
+ The signed 32-bit integers are stored using the 2's complement representation.
344
+
345
+ ### map - 7
346
+
347
+ A map data type contains a set of key/value pairs. Unlike other data types,
348
+ the length information for maps indicates how many key/value pairs it
349
+ contains, not its length in bytes. This size can be zero.
350
+
351
+ See below for the algorithm used to determine the number of pairs in the
352
+ map. This algorithm is also used to determine the length of a field's
353
+ payload.
354
+
355
+ ### array - 11
356
+
357
+ An array type contains a set of ordered values. The length information for
358
+ arrays indicates how many values it contains, not its length in bytes. This
359
+ size can be zero.
360
+
361
+ This type uses the same algorithm as maps for determining the length of a
362
+ field's payload.
363
+
364
+ ### data cache container - 12
365
+
366
+ This is a special data type that marks a container used to cache repeated
367
+ data. For example, instead of repeating the string "United States" over and
368
+ over in the database, we store it in the cache container and use pointers
369
+ *into* this container instead.
370
+
371
+ Nothing in the database will ever contain a pointer to this field
372
+ itself. Instead, various fields will point into the container.
373
+
374
+ The primary reason for making this a separate data type versus simply inlining
375
+ the cached data is so that a database dumper tool can skip this cache when
376
+ dumping the data section. The cache contents will end up being dumped as
377
+ pointers into it are followed.
378
+
379
+ ### end marker - 13
380
+
381
+ The end marker marks the end of the data section. It is not strictly
382
+ necessary, but including this marker allows a data section deserializer to
383
+ process a stream of input, rather than having to find the end of the section
384
+ before beginning the deserialization.
385
+
386
+ This data type is not followed by a payload, and its size is always zero.
387
+
388
+ ### boolean - 14
389
+
390
+ A true or false value. The length information for a boolean type will always
391
+ be 0 or 1, indicating the value. There is no payload for this field.
392
+
393
+ ### float - 15
394
+
395
+ This is stored as an IEEE-754 float (binary32) in big-endian format. The
396
+ length of a float is always 4 bytes.
397
+
398
+ This type is provided primarily for completeness. Because of the way floating
399
+ point numbers are stored, this type can easily lose precision when serialized
400
+ and then deserialized. If this is an issue for you, consider using a double
401
+ instead.
402
+
403
+ ### Data Field Format
404
+
405
+ Each field starts with a control byte. This control byte provides information
406
+ about the field's data type and payload size.
407
+
408
+ The first three bits of the control byte tell you what type the field is. If
409
+ these bits are all 0, then this is an "extended" type, which means that the
410
+ *next* byte contains the actual type. Otherwise, the first three bits will
411
+ contain a number from 1 to 7, the actual type for the field.
412
+
413
+ We've tried to assign the most commonly used types as numbers 1-7 as an
414
+ optimization.
415
+
416
+ With an extended type, the type number in the second byte is the number minus
417
+ 7. In other words, an array (type 11) will be stored with a 0 for the type in
418
+ the first byte and a 4 in the second.
419
+
420
+ Here is an example of how the control byte may combine with the next byte to
421
+ tell us the type:
422
+
423
+ 001XXXXX pointer
424
+ 010XXXXX UTF-8 string
425
+ 110XXXXX unsigned 32-bit int (ASCII)
426
+ 000XXXXX 00000011 unsigned 128-bit int (binary)
427
+ 000XXXXX 00000100 array
428
+ 000XXXXX 00000110 end marker
429
+
430
+ #### Payload Size
431
+
432
+ The next five bits in the control byte tell you how long the data field's
433
+ payload is, except for maps and pointers. Maps and pointers use this size
434
+ information a bit differently. See below.
435
+
436
+ If the five bits are smaller than 29, then those bits are the payload size in
437
+ bytes. For example:
438
+
439
+ 01000010 UTF-8 string - 2 bytes long
440
+ 01011100 UTF-8 string - 28 bytes long
441
+ 11000001 unsigned 32-bit int - 1 byte long
442
+ 00000011 00000011 unsigned 128-bit int - 3 bytes long
443
+
444
+ If the five bits are equal to 29, 30, or 31, then use the following algorithm
445
+ to calculate the payload size.
446
+
447
+ If the value is 29, then the size is 29 + *the next byte after the type
448
+ specifying bytes as an unsigned integer*.
449
+
450
+ If the value is 30, then the size is 285 + *the next two bytes after the type
451
+ specifying bytes as a single unsigned integer*.
452
+
453
+ If the value is 31, then the size is 65,821 + *the next three bytes after the
454
+ type specifying bytes as a single unsigned integer*.
455
+
456
+ Some examples:
457
+
458
+ 01011101 00110011 UTF-8 string - 80 bytes long
459
+
460
+ In this case, the last five bits of the control byte equal 29. We treat the
461
+ next byte as an unsigned integer. The next byte is 51, so the total size is
462
+ (29 + 51) = 80.
463
+
464
+ 01011110 00110011 00110011 UTF-8 string - 13,392 bytes long
465
+
466
+ The last five bits of the control byte equal 30. We treat the next two bytes
467
+ as a single unsigned integer. The next two bytes equal 13,107, so the total
468
+ size is (285 + 13,107) = 13,392.
469
+
470
+ 01011111 00110011 00110011 00110011 UTF-8 string - 3,421,264 bytes long
471
+
472
+ The last five bits of the control byte equal 31. We treat the next three bytes
473
+ as a single unsigned integer. The next three bytes equal 3,355,443, so the
474
+ total size is (65,821 + 3,355,443) = 3,421,264.
475
+
476
+ This means that the maximum payload size for a single field is 16,843,036
477
+ bytes.
478
+
479
+ The binary number types always have a known size, but for consistency's sake,
480
+ the control byte will always specify the correct size for these types.
481
+
482
+ #### Maps
483
+
484
+ Maps use the size in the control byte (and any following bytes) to indicate
485
+ the number of key/value pairs in the map, not the size of the payload in
486
+ bytes.
487
+
488
+ This means that the maximum number of pairs for a single map is 16,843,036.
489
+
490
+ Maps are laid out with each key followed by its value, followed by the next
491
+ pair, etc.
492
+
493
+ The keys are **always** UTF-8 strings. The values may be any data type,
494
+ including maps or pointers.
495
+
496
+ Once we know the number of pairs, we can look at each pair in turn to
497
+ determine the size of the key and the key name, as well as the value's type
498
+ and payload.
499
+
500
+ #### Pointers
501
+
502
+ Pointers use the last five bits in the control byte to calculate the pointer
503
+ value.
504
+
505
+ To calculate the pointer value, we start by subdividing the five bits into two
506
+ groups. The first two bits indicate the size, and the next three bits are part
507
+ of the value, so we end up with a control byte breaking down like this:
508
+ 001SSVVV.
509
+
510
+ The size can be 0, 1, 2, or 3.
511
+
512
+ If the size is 0, the pointer is built by appending the next byte to the last
513
+ three bits to produce an 11-bit value.
514
+
515
+ If the size is 1, the pointer is built by appending the next two bytes to the
516
+ last three bits to produce a 19-bit value + 2048.
517
+
518
+ If the size is 2, the pointer is built by appending the next three bytes to the
519
+ last three bits to produce a 27-bit value + 526336.
520
+
521
+ Finally, if the size is 3, the pointer's value is contained in the next four
522
+ bytes as a 32-bit value. In this case, the last three bits of the control byte
523
+ are ignored.
524
+
525
+ This means that we are limited to 4GB of address space for pointers, so the
526
+ data section size for the database is limited to 4GB.
527
+
528
+ ## Reference Implementations
529
+
530
+ ### Writer
531
+
532
+ * [Perl](https://github.com/maxmind/MaxMind-DB-Writer-perl)
533
+
534
+ ### Reader
535
+
536
+ * [C](https://github.com/maxmind/libmaxminddb)
537
+ * [C#](https://github.com/maxmind/MaxMind-DB-Reader-dotnet)
538
+ * [Java](https://github.com/maxmind/MaxMind-DB-Reader-java)
539
+ * [Perl](https://github.com/maxmind/MaxMind-DB-Reader-perl)
540
+ * [PHP](https://github.com/maxmind/MaxMind-DB-Reader-php)
541
+ * [Python](https://github.com/maxmind/MaxMind-DB-Reader-python)
542
+
543
+ ## Authors
544
+
545
+ This specification was created by the following authors:
546
+
547
+ * Greg Oschwald \<goschwald@maxmind.com\>
548
+ * Dave Rolsky \<drolsky@maxmind.com\>
549
+ * Boris Zentner \<bzentner@maxmind.com\>
550
+
551
+ ## License
552
+
553
+ This work is licensed under the Creative Commons Attribution-ShareAlike 3.0
554
+ Unported License. To view a copy of this license, visit
555
+ [http://creativecommons.org/licenses/by-sa/3.0/](http://creativecommons.org/licenses/by-sa/3.0/)
556
+ or send a letter to Creative Commons, 444 Castro Street, Suite 900, Mountain
557
+ View, California, 94041, USA
558
+
@@ -0,0 +1,4 @@
1
+ MaxMind DB is a binary file format that stores data indexed by IP address
2
+ subnets (IPv4 or IPv6).
3
+
4
+ This repository contains the spec for that format.
@@ -0,0 +1,7 @@
1
+ These are corrupt databases that have been known to cause problems such as
2
+ segfaults or unhandled errors on one or more MaxMind DB reader
3
+ implementations. Implementations _should_ return an appropriate error
4
+ or raise an exception on these databases.
5
+
6
+ If you find a corrupt test-sized database that crashes a MMDB reader library,
7
+ please feel free to add it here by creating a pull request.
@@ -0,0 +1 @@
1
+ ���MaxMind.com�Kdescription�Ben�
@@ -0,0 +1 @@
1
+ Dmap2�Earray�Dmap3�Aa�Ab�Ac����MaxMind.com�[binary_format_major_version�[binary_format_minor_version�Kbuild_epochX�2|Mdatabase_type]MaxMind DB Nested Data StructuresKdescription
@@ -0,0 +1,12 @@
1
+ --blank-lines-before-packages=0
2
+ --iterations=2
3
+ --no-outdent-long-comments
4
+ -b
5
+ -bar
6
+ -boc
7
+ -ci=4
8
+ -i=4
9
+ -l=78
10
+ -nolq
11
+ -se
12
+ -wbb="% + - * / x != == >= <= =~ !~ < > | & >= < = **= += *= &= <<= &&= -= /= |= >>= ||= .= %= ^= x="
@@ -0,0 +1,32 @@
1
+ [
2
+ {
3
+ "::1.2.0.0/112" : {
4
+ "is_anonymous" : true,
5
+ "is_anonymous_vpn" : true
6
+ }
7
+ },
8
+ {
9
+ "::71.160.223.0/120" : {
10
+ "is_anonymous" : true,
11
+ "is_hosting_provider" : true
12
+ }
13
+ },
14
+ {
15
+ "::186.30.236.0/120" : {
16
+ "is_anonymous" : true,
17
+ "is_public_proxy" : true
18
+ }
19
+ },
20
+ {
21
+ "::65.0.0.0/109" : {
22
+ "is_anonymous" : true,
23
+ "is_tor_exit_node" : true
24
+ }
25
+ },
26
+ {
27
+ "abcd:1000::/112" : {
28
+ "is_anonymous" : true,
29
+ "is_public_proxy" : true
30
+ }
31
+ }
32
+ ]