@loaders.gl/parquet 4.2.0-alpha.4 → 4.2.0-alpha.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (265) hide show
  1. package/dist/index.cjs +269 -616
  2. package/dist/index.cjs.map +7 -0
  3. package/dist/index.d.ts +12 -12
  4. package/dist/index.d.ts.map +1 -1
  5. package/dist/index.js +6 -1
  6. package/dist/lib/arrow/convert-columns-to-row-group.js +1 -2
  7. package/dist/lib/arrow/convert-row-group-to-columns.js +8 -6
  8. package/dist/lib/arrow/convert-schema-from-parquet.d.ts +3 -3
  9. package/dist/lib/arrow/convert-schema-from-parquet.d.ts.map +1 -1
  10. package/dist/lib/arrow/convert-schema-from-parquet.js +71 -82
  11. package/dist/lib/arrow/convert-schema-to-parquet.d.ts +1 -1
  12. package/dist/lib/arrow/convert-schema-to-parquet.d.ts.map +1 -1
  13. package/dist/lib/arrow/convert-schema-to-parquet.js +65 -35
  14. package/dist/lib/constants.js +14 -1
  15. package/dist/lib/parsers/get-parquet-schema.d.ts +1 -1
  16. package/dist/lib/parsers/get-parquet-schema.d.ts.map +1 -1
  17. package/dist/lib/parsers/get-parquet-schema.js +9 -7
  18. package/dist/lib/parsers/parse-geoparquet.d.ts +1 -1
  19. package/dist/lib/parsers/parse-geoparquet.d.ts.map +1 -1
  20. package/dist/lib/parsers/parse-geoparquet.js +47 -45
  21. package/dist/lib/parsers/parse-parquet-to-columns.d.ts +1 -1
  22. package/dist/lib/parsers/parse-parquet-to-columns.d.ts.map +1 -1
  23. package/dist/lib/parsers/parse-parquet-to-columns.js +36 -25
  24. package/dist/lib/parsers/parse-parquet.d.ts +1 -1
  25. package/dist/lib/parsers/parse-parquet.d.ts.map +1 -1
  26. package/dist/lib/parsers/parse-parquet.js +62 -50
  27. package/dist/lib/wasm/encode-parquet-wasm.d.ts +1 -1
  28. package/dist/lib/wasm/encode-parquet-wasm.d.ts.map +1 -1
  29. package/dist/lib/wasm/encode-parquet-wasm.js +49 -12
  30. package/dist/lib/wasm/load-wasm.js +13 -10
  31. package/dist/lib/wasm/parse-parquet-wasm.d.ts +1 -1
  32. package/dist/lib/wasm/parse-parquet-wasm.d.ts.map +1 -1
  33. package/dist/lib/wasm/parse-parquet-wasm.js +19 -15
  34. package/dist/parquet-loader.d.ts +1 -1
  35. package/dist/parquet-loader.d.ts.map +1 -1
  36. package/dist/parquet-loader.js +77 -64
  37. package/dist/parquet-wasm-loader.js +25 -28
  38. package/dist/parquet-wasm-writer.js +19 -23
  39. package/dist/parquet-writer.js +16 -12
  40. package/dist/parquetjs/codecs/declare.d.ts +1 -1
  41. package/dist/parquetjs/codecs/declare.d.ts.map +1 -1
  42. package/dist/parquetjs/codecs/declare.js +0 -1
  43. package/dist/parquetjs/codecs/dictionary.js +4 -8
  44. package/dist/parquetjs/codecs/index.d.ts +3 -3
  45. package/dist/parquetjs/codecs/index.d.ts.map +1 -1
  46. package/dist/parquetjs/codecs/index.js +20 -17
  47. package/dist/parquetjs/codecs/plain.d.ts +2 -2
  48. package/dist/parquetjs/codecs/plain.d.ts.map +1 -1
  49. package/dist/parquetjs/codecs/plain.js +166 -162
  50. package/dist/parquetjs/codecs/rle.d.ts +2 -2
  51. package/dist/parquetjs/codecs/rle.d.ts.map +1 -1
  52. package/dist/parquetjs/codecs/rle.js +124 -105
  53. package/dist/parquetjs/compression.d.ts +1 -1
  54. package/dist/parquetjs/compression.d.ts.map +1 -1
  55. package/dist/parquetjs/compression.js +157 -43
  56. package/dist/parquetjs/encoder/parquet-encoder.d.ts +3 -3
  57. package/dist/parquetjs/encoder/parquet-encoder.d.ts.map +1 -1
  58. package/dist/parquetjs/encoder/parquet-encoder.js +406 -275
  59. package/dist/parquetjs/parquet-thrift/BoundaryOrder.js +12 -7
  60. package/dist/parquetjs/parquet-thrift/BsonType.js +31 -27
  61. package/dist/parquetjs/parquet-thrift/ColumnChunk.d.ts +1 -1
  62. package/dist/parquetjs/parquet-thrift/ColumnChunk.d.ts.map +1 -1
  63. package/dist/parquetjs/parquet-thrift/ColumnChunk.js +175 -166
  64. package/dist/parquetjs/parquet-thrift/ColumnIndex.d.ts +1 -1
  65. package/dist/parquetjs/parquet-thrift/ColumnIndex.d.ts.map +1 -1
  66. package/dist/parquetjs/parquet-thrift/ColumnIndex.js +178 -166
  67. package/dist/parquetjs/parquet-thrift/ColumnMetaData.d.ts +6 -6
  68. package/dist/parquetjs/parquet-thrift/ColumnMetaData.d.ts.map +1 -1
  69. package/dist/parquetjs/parquet-thrift/ColumnMetaData.js +359 -333
  70. package/dist/parquetjs/parquet-thrift/ColumnOrder.d.ts +1 -1
  71. package/dist/parquetjs/parquet-thrift/ColumnOrder.d.ts.map +1 -1
  72. package/dist/parquetjs/parquet-thrift/ColumnOrder.js +73 -68
  73. package/dist/parquetjs/parquet-thrift/CompressionCodec.js +17 -12
  74. package/dist/parquetjs/parquet-thrift/ConvertedType.js +31 -26
  75. package/dist/parquetjs/parquet-thrift/DataPageHeader.d.ts +2 -2
  76. package/dist/parquetjs/parquet-thrift/DataPageHeader.d.ts.map +1 -1
  77. package/dist/parquetjs/parquet-thrift/DataPageHeader.js +136 -124
  78. package/dist/parquetjs/parquet-thrift/DataPageHeaderV2.d.ts +2 -2
  79. package/dist/parquetjs/parquet-thrift/DataPageHeaderV2.d.ts.map +1 -1
  80. package/dist/parquetjs/parquet-thrift/DataPageHeaderV2.js +198 -181
  81. package/dist/parquetjs/parquet-thrift/DateType.js +31 -27
  82. package/dist/parquetjs/parquet-thrift/DecimalType.js +76 -69
  83. package/dist/parquetjs/parquet-thrift/DictionaryPageHeader.d.ts +1 -1
  84. package/dist/parquetjs/parquet-thrift/DictionaryPageHeader.d.ts.map +1 -1
  85. package/dist/parquetjs/parquet-thrift/DictionaryPageHeader.js +93 -86
  86. package/dist/parquetjs/parquet-thrift/Encoding.js +17 -12
  87. package/dist/parquetjs/parquet-thrift/EnumType.js +31 -27
  88. package/dist/parquetjs/parquet-thrift/FieldRepetitionType.js +12 -7
  89. package/dist/parquetjs/parquet-thrift/FileMetaData.d.ts +4 -4
  90. package/dist/parquetjs/parquet-thrift/FileMetaData.d.ts.map +1 -1
  91. package/dist/parquetjs/parquet-thrift/FileMetaData.js +217 -205
  92. package/dist/parquetjs/parquet-thrift/IndexPageHeader.js +31 -27
  93. package/dist/parquetjs/parquet-thrift/IntType.js +76 -69
  94. package/dist/parquetjs/parquet-thrift/JsonType.js +31 -27
  95. package/dist/parquetjs/parquet-thrift/KeyValue.js +73 -67
  96. package/dist/parquetjs/parquet-thrift/ListType.js +31 -27
  97. package/dist/parquetjs/parquet-thrift/LogicalType.d.ts +13 -13
  98. package/dist/parquetjs/parquet-thrift/LogicalType.d.ts.map +1 -1
  99. package/dist/parquetjs/parquet-thrift/LogicalType.js +325 -344
  100. package/dist/parquetjs/parquet-thrift/MapType.js +31 -27
  101. package/dist/parquetjs/parquet-thrift/MicroSeconds.js +31 -27
  102. package/dist/parquetjs/parquet-thrift/MilliSeconds.js +31 -27
  103. package/dist/parquetjs/parquet-thrift/NullType.js +31 -27
  104. package/dist/parquetjs/parquet-thrift/OffsetIndex.d.ts +1 -1
  105. package/dist/parquetjs/parquet-thrift/OffsetIndex.d.ts.map +1 -1
  106. package/dist/parquetjs/parquet-thrift/OffsetIndex.js +66 -60
  107. package/dist/parquetjs/parquet-thrift/PageEncodingStats.d.ts +2 -2
  108. package/dist/parquetjs/parquet-thrift/PageEncodingStats.d.ts.map +1 -1
  109. package/dist/parquetjs/parquet-thrift/PageEncodingStats.js +98 -88
  110. package/dist/parquetjs/parquet-thrift/PageHeader.d.ts +5 -5
  111. package/dist/parquetjs/parquet-thrift/PageHeader.d.ts.map +1 -1
  112. package/dist/parquetjs/parquet-thrift/PageHeader.js +185 -175
  113. package/dist/parquetjs/parquet-thrift/PageLocation.js +108 -96
  114. package/dist/parquetjs/parquet-thrift/PageType.js +13 -8
  115. package/dist/parquetjs/parquet-thrift/RowGroup.d.ts +2 -2
  116. package/dist/parquetjs/parquet-thrift/RowGroup.d.ts.map +1 -1
  117. package/dist/parquetjs/parquet-thrift/RowGroup.js +145 -133
  118. package/dist/parquetjs/parquet-thrift/SchemaElement.d.ts +4 -4
  119. package/dist/parquetjs/parquet-thrift/SchemaElement.d.ts.map +1 -1
  120. package/dist/parquetjs/parquet-thrift/SchemaElement.js +211 -205
  121. package/dist/parquetjs/parquet-thrift/SortingColumn.js +98 -88
  122. package/dist/parquetjs/parquet-thrift/Statistics.js +143 -137
  123. package/dist/parquetjs/parquet-thrift/StringType.js +31 -27
  124. package/dist/parquetjs/parquet-thrift/TimeType.d.ts +1 -1
  125. package/dist/parquetjs/parquet-thrift/TimeType.d.ts.map +1 -1
  126. package/dist/parquetjs/parquet-thrift/TimeType.js +76 -69
  127. package/dist/parquetjs/parquet-thrift/TimeUnit.d.ts +2 -2
  128. package/dist/parquetjs/parquet-thrift/TimeUnit.d.ts.map +1 -1
  129. package/dist/parquetjs/parquet-thrift/TimeUnit.js +94 -91
  130. package/dist/parquetjs/parquet-thrift/TimestampType.d.ts +1 -1
  131. package/dist/parquetjs/parquet-thrift/TimestampType.d.ts.map +1 -1
  132. package/dist/parquetjs/parquet-thrift/TimestampType.js +76 -69
  133. package/dist/parquetjs/parquet-thrift/Type.js +17 -12
  134. package/dist/parquetjs/parquet-thrift/TypeDefinedOrder.js +31 -27
  135. package/dist/parquetjs/parquet-thrift/UUIDType.js +31 -27
  136. package/dist/parquetjs/parquet-thrift/index.d.ts +44 -44
  137. package/dist/parquetjs/parquet-thrift/index.d.ts.map +1 -1
  138. package/dist/parquetjs/parquet-thrift/index.js +6 -1
  139. package/dist/parquetjs/parser/decoders.d.ts +3 -3
  140. package/dist/parquetjs/parser/decoders.d.ts.map +1 -1
  141. package/dist/parquetjs/parser/decoders.js +306 -242
  142. package/dist/parquetjs/parser/parquet-reader.d.ts +3 -3
  143. package/dist/parquetjs/parser/parquet-reader.d.ts.map +1 -1
  144. package/dist/parquetjs/parser/parquet-reader.js +179 -157
  145. package/dist/parquetjs/schema/declare.d.ts +1 -1
  146. package/dist/parquetjs/schema/declare.d.ts.map +1 -1
  147. package/dist/parquetjs/schema/declare.js +6 -9
  148. package/dist/parquetjs/schema/schema.d.ts +1 -1
  149. package/dist/parquetjs/schema/schema.d.ts.map +1 -1
  150. package/dist/parquetjs/schema/schema.js +139 -120
  151. package/dist/parquetjs/schema/shred.d.ts +2 -2
  152. package/dist/parquetjs/schema/shred.d.ts.map +1 -1
  153. package/dist/parquetjs/schema/shred.js +301 -204
  154. package/dist/parquetjs/schema/types.d.ts +1 -1
  155. package/dist/parquetjs/schema/types.d.ts.map +1 -1
  156. package/dist/parquetjs/schema/types.js +328 -314
  157. package/dist/parquetjs/utils/file-utils.js +24 -23
  158. package/dist/parquetjs/utils/read-utils.d.ts +1 -1
  159. package/dist/parquetjs/utils/read-utils.d.ts.map +1 -1
  160. package/dist/parquetjs/utils/read-utils.js +79 -69
  161. package/dist/polyfills/buffer/buffer-polyfill.browser.js +8 -3
  162. package/dist/polyfills/buffer/buffer-polyfill.node.js +13 -8
  163. package/dist/polyfills/buffer/buffer.js +1750 -1367
  164. package/dist/polyfills/buffer/index.d.ts +3 -3
  165. package/dist/polyfills/buffer/index.d.ts.map +1 -1
  166. package/dist/polyfills/buffer/index.js +5 -1
  167. package/dist/polyfills/buffer/install-buffer-polyfill.js +2 -1
  168. package/dist/polyfills/util.js +4 -1
  169. package/dist/workers/parquet-worker.js +3 -1
  170. package/package.json +15 -13
  171. package/src/index.ts +1 -1
  172. package/src/lib/parsers/parse-parquet-to-columns.ts +1 -1
  173. package/src/lib/parsers/parse-parquet.ts +1 -1
  174. package/src/parquetjs/encoder/parquet-encoder.ts +2 -2
  175. package/src/parquetjs/parser/decoders.ts +2 -2
  176. package/src/parquetjs/parser/parquet-reader.ts +1 -1
  177. package/src/parquetjs/schema/schema.ts +1 -1
  178. package/src/parquetjs/utils/read-utils.ts +2 -2
  179. package/dist/index.js.map +0 -1
  180. package/dist/lib/arrow/convert-columns-to-row-group.js.map +0 -1
  181. package/dist/lib/arrow/convert-row-group-to-columns.js.map +0 -1
  182. package/dist/lib/arrow/convert-schema-from-parquet.js.map +0 -1
  183. package/dist/lib/arrow/convert-schema-to-parquet.js.map +0 -1
  184. package/dist/lib/constants.js.map +0 -1
  185. package/dist/lib/parsers/get-parquet-schema.js.map +0 -1
  186. package/dist/lib/parsers/parse-geoparquet.js.map +0 -1
  187. package/dist/lib/parsers/parse-parquet-to-columns.js.map +0 -1
  188. package/dist/lib/parsers/parse-parquet.js.map +0 -1
  189. package/dist/lib/wasm/encode-parquet-wasm.js.map +0 -1
  190. package/dist/lib/wasm/load-wasm.js.map +0 -1
  191. package/dist/lib/wasm/parse-parquet-wasm.js.map +0 -1
  192. package/dist/lib/wip/convert-schema-deep.java.disabled +0 -910
  193. package/dist/lib/wip/convert-schema-deep.rs.disabled +0 -976
  194. package/dist/parquet-loader.js.map +0 -1
  195. package/dist/parquet-wasm-loader.js.map +0 -1
  196. package/dist/parquet-wasm-writer.js.map +0 -1
  197. package/dist/parquet-writer.js.map +0 -1
  198. package/dist/parquetjs/LICENSE +0 -20
  199. package/dist/parquetjs/codecs/declare.js.map +0 -1
  200. package/dist/parquetjs/codecs/dictionary.js.map +0 -1
  201. package/dist/parquetjs/codecs/index.js.map +0 -1
  202. package/dist/parquetjs/codecs/plain.js.map +0 -1
  203. package/dist/parquetjs/codecs/rle.js.map +0 -1
  204. package/dist/parquetjs/compression.js.map +0 -1
  205. package/dist/parquetjs/encoder/parquet-encoder.js.map +0 -1
  206. package/dist/parquetjs/modules.d.ts +0 -21
  207. package/dist/parquetjs/parquet-thrift/BoundaryOrder.js.map +0 -1
  208. package/dist/parquetjs/parquet-thrift/BsonType.js.map +0 -1
  209. package/dist/parquetjs/parquet-thrift/ColumnChunk.js.map +0 -1
  210. package/dist/parquetjs/parquet-thrift/ColumnIndex.js.map +0 -1
  211. package/dist/parquetjs/parquet-thrift/ColumnMetaData.js.map +0 -1
  212. package/dist/parquetjs/parquet-thrift/ColumnOrder.js.map +0 -1
  213. package/dist/parquetjs/parquet-thrift/CompressionCodec.js.map +0 -1
  214. package/dist/parquetjs/parquet-thrift/ConvertedType.js.map +0 -1
  215. package/dist/parquetjs/parquet-thrift/DataPageHeader.js.map +0 -1
  216. package/dist/parquetjs/parquet-thrift/DataPageHeaderV2.js.map +0 -1
  217. package/dist/parquetjs/parquet-thrift/DateType.js.map +0 -1
  218. package/dist/parquetjs/parquet-thrift/DecimalType.js.map +0 -1
  219. package/dist/parquetjs/parquet-thrift/DictionaryPageHeader.js.map +0 -1
  220. package/dist/parquetjs/parquet-thrift/Encoding.js.map +0 -1
  221. package/dist/parquetjs/parquet-thrift/EnumType.js.map +0 -1
  222. package/dist/parquetjs/parquet-thrift/FieldRepetitionType.js.map +0 -1
  223. package/dist/parquetjs/parquet-thrift/FileMetaData.js.map +0 -1
  224. package/dist/parquetjs/parquet-thrift/IndexPageHeader.js.map +0 -1
  225. package/dist/parquetjs/parquet-thrift/IntType.js.map +0 -1
  226. package/dist/parquetjs/parquet-thrift/JsonType.js.map +0 -1
  227. package/dist/parquetjs/parquet-thrift/KeyValue.js.map +0 -1
  228. package/dist/parquetjs/parquet-thrift/ListType.js.map +0 -1
  229. package/dist/parquetjs/parquet-thrift/LogicalType.js.map +0 -1
  230. package/dist/parquetjs/parquet-thrift/MapType.js.map +0 -1
  231. package/dist/parquetjs/parquet-thrift/MicroSeconds.js.map +0 -1
  232. package/dist/parquetjs/parquet-thrift/MilliSeconds.js.map +0 -1
  233. package/dist/parquetjs/parquet-thrift/NullType.js.map +0 -1
  234. package/dist/parquetjs/parquet-thrift/OffsetIndex.js.map +0 -1
  235. package/dist/parquetjs/parquet-thrift/PageEncodingStats.js.map +0 -1
  236. package/dist/parquetjs/parquet-thrift/PageHeader.js.map +0 -1
  237. package/dist/parquetjs/parquet-thrift/PageLocation.js.map +0 -1
  238. package/dist/parquetjs/parquet-thrift/PageType.js.map +0 -1
  239. package/dist/parquetjs/parquet-thrift/RowGroup.js.map +0 -1
  240. package/dist/parquetjs/parquet-thrift/SchemaElement.js.map +0 -1
  241. package/dist/parquetjs/parquet-thrift/SortingColumn.js.map +0 -1
  242. package/dist/parquetjs/parquet-thrift/Statistics.js.map +0 -1
  243. package/dist/parquetjs/parquet-thrift/StringType.js.map +0 -1
  244. package/dist/parquetjs/parquet-thrift/TimeType.js.map +0 -1
  245. package/dist/parquetjs/parquet-thrift/TimeUnit.js.map +0 -1
  246. package/dist/parquetjs/parquet-thrift/TimestampType.js.map +0 -1
  247. package/dist/parquetjs/parquet-thrift/Type.js.map +0 -1
  248. package/dist/parquetjs/parquet-thrift/TypeDefinedOrder.js.map +0 -1
  249. package/dist/parquetjs/parquet-thrift/UUIDType.js.map +0 -1
  250. package/dist/parquetjs/parquet-thrift/index.js.map +0 -1
  251. package/dist/parquetjs/parser/decoders.js.map +0 -1
  252. package/dist/parquetjs/parser/parquet-reader.js.map +0 -1
  253. package/dist/parquetjs/schema/declare.js.map +0 -1
  254. package/dist/parquetjs/schema/schema.js.map +0 -1
  255. package/dist/parquetjs/schema/shred.js.map +0 -1
  256. package/dist/parquetjs/schema/types.js.map +0 -1
  257. package/dist/parquetjs/utils/file-utils.js.map +0 -1
  258. package/dist/parquetjs/utils/read-utils.js.map +0 -1
  259. package/dist/polyfills/buffer/buffer-polyfill.browser.js.map +0 -1
  260. package/dist/polyfills/buffer/buffer-polyfill.node.js.map +0 -1
  261. package/dist/polyfills/buffer/buffer.js.map +0 -1
  262. package/dist/polyfills/buffer/index.js.map +0 -1
  263. package/dist/polyfills/buffer/install-buffer-polyfill.js.map +0 -1
  264. package/dist/polyfills/util.js.map +0 -1
  265. package/dist/workers/parquet-worker.js.map +0 -1
@@ -1,976 +0,0 @@
1
- //! This module has a single entry point, [`parquet_to_arrow_schema`].
2
- use parquet2::schema::{
3
- types::{
4
- FieldInfo, GroupConvertedType, GroupLogicalType, IntegerType, ParquetType, PhysicalType,
5
- PrimitiveConvertedType, PrimitiveLogicalType, PrimitiveType, TimeUnit as ParquetTimeUnit,
6
- },
7
- Repetition,
8
- };
9
-
10
- use crate::datatypes::{DataType, Field, IntervalUnit, TimeUnit};
11
-
12
- /// Converts [`ParquetType`]s to a [`Field`], ignoring parquet fields that do not contain
13
- /// any physical column.
14
- pub fn parquet_to_arrow_schema(fields: &[ParquetType]) -> Vec<Field> {
15
- fields.iter().filter_map(to_field).collect::<Vec<_>>()
16
- }
17
-
18
- fn from_int32(
19
- logical_type: Option<PrimitiveLogicalType>,
20
- converted_type: Option<PrimitiveConvertedType>,
21
- ) -> DataType {
22
- use PrimitiveLogicalType::*;
23
- match (logical_type, converted_type) {
24
- // handle logical types first
25
- (Some(Integer(t)), _) => match t {
26
- IntegerType::Int8 => DataType::Int8,
27
- IntegerType::Int16 => DataType::Int16,
28
- IntegerType::Int32 => DataType::Int32,
29
- IntegerType::UInt8 => DataType::UInt8,
30
- IntegerType::UInt16 => DataType::UInt16,
31
- IntegerType::UInt32 => DataType::UInt32,
32
- // The above are the only possible annotations for parquet's int32. Anything else
33
- // is a deviation to the parquet specification and we ignore
34
- _ => DataType::Int32,
35
- },
36
- (Some(Decimal(precision, scale)), _) => DataType::Decimal(precision, scale),
37
- (Some(Date), _) => DataType::Date32,
38
- (Some(Time { unit, .. }), _) => match unit {
39
- ParquetTimeUnit::Milliseconds => DataType::Time32(TimeUnit::Millisecond),
40
- // MILLIS is the only possible annotation for parquet's int32. Anything else
41
- // is a deviation to the parquet specification and we ignore
42
- _ => DataType::Int32,
43
- },
44
- // handle converted types:
45
- (_, Some(PrimitiveConvertedType::Uint8)) => DataType::UInt8,
46
- (_, Some(PrimitiveConvertedType::Uint16)) => DataType::UInt16,
47
- (_, Some(PrimitiveConvertedType::Uint32)) => DataType::UInt32,
48
- (_, Some(PrimitiveConvertedType::Int8)) => DataType::Int8,
49
- (_, Some(PrimitiveConvertedType::Int16)) => DataType::Int16,
50
- (_, Some(PrimitiveConvertedType::Int32)) => DataType::Int32,
51
- (_, Some(PrimitiveConvertedType::Date)) => DataType::Date32,
52
- (_, Some(PrimitiveConvertedType::TimeMillis)) => DataType::Time32(TimeUnit::Millisecond),
53
- (_, Some(PrimitiveConvertedType::Decimal(precision, scale))) => {
54
- DataType::Decimal(precision, scale)
55
- }
56
- (_, _) => DataType::Int32,
57
- }
58
- }
59
-
60
- fn from_int64(
61
- logical_type: Option<PrimitiveLogicalType>,
62
- converted_type: Option<PrimitiveConvertedType>,
63
- ) -> DataType {
64
- use PrimitiveLogicalType::*;
65
- match (logical_type, converted_type) {
66
- // handle logical types first
67
- (Some(Integer(integer)), _) => match integer {
68
- IntegerType::UInt64 => DataType::UInt64,
69
- IntegerType::Int64 => DataType::Int64,
70
- _ => DataType::Int64,
71
- },
72
- (
73
- Some(Timestamp {
74
- is_adjusted_to_utc,
75
- unit,
76
- }),
77
- _,
78
- ) => {
79
- let timezone = if is_adjusted_to_utc {
80
- // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md
81
- // A TIMESTAMP with isAdjustedToUTC=true is defined as [...] elapsed since the Unix epoch
82
- Some("+00:00".to_string())
83
- } else {
84
- // PARQUET:
85
- // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md
86
- // A TIMESTAMP with isAdjustedToUTC=false represents [...] such
87
- // timestamps should always be displayed the same way, regardless of the local time zone in effect
88
- // ARROW:
89
- // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md
90
- // If the time zone is null or equal to an empty string, the data is "time
91
- // zone naive" and shall be displayed *as is* to the user, not localized
92
- // to the locale of the user.
93
- None
94
- };
95
-
96
- match unit {
97
- ParquetTimeUnit::Milliseconds => {
98
- DataType::Timestamp(TimeUnit::Millisecond, timezone)
99
- }
100
- ParquetTimeUnit::Microseconds => {
101
- DataType::Timestamp(TimeUnit::Microsecond, timezone)
102
- }
103
- ParquetTimeUnit::Nanoseconds => DataType::Timestamp(TimeUnit::Nanosecond, timezone),
104
- }
105
- }
106
- (Some(Time { unit, .. }), _) => match unit {
107
- ParquetTimeUnit::Microseconds => DataType::Time64(TimeUnit::Microsecond),
108
- ParquetTimeUnit::Nanoseconds => DataType::Time64(TimeUnit::Nanosecond),
109
- // MILLIS is only possible for int32. Appearing in int64 is a deviation
110
- // to parquet's spec, which we ignore
111
- _ => DataType::Int64,
112
- },
113
- (Some(Decimal(precision, scale)), _) => DataType::Decimal(precision, scale),
114
- // handle converted types:
115
- (_, Some(PrimitiveConvertedType::TimeMicros)) => DataType::Time64(TimeUnit::Microsecond),
116
- (_, Some(PrimitiveConvertedType::TimestampMillis)) => {
117
- DataType::Timestamp(TimeUnit::Millisecond, None)
118
- }
119
- (_, Some(PrimitiveConvertedType::TimestampMicros)) => {
120
- DataType::Timestamp(TimeUnit::Microsecond, None)
121
- }
122
- (_, Some(PrimitiveConvertedType::Int64)) => DataType::Int64,
123
- (_, Some(PrimitiveConvertedType::Uint64)) => DataType::UInt64,
124
- (_, Some(PrimitiveConvertedType::Decimal(precision, scale))) => {
125
- DataType::Decimal(precision, scale)
126
- }
127
-
128
- (_, _) => DataType::Int64,
129
- }
130
- }
131
-
132
- fn from_byte_array(
133
- logical_type: &Option<PrimitiveLogicalType>,
134
- converted_type: &Option<PrimitiveConvertedType>,
135
- ) -> DataType {
136
- match (logical_type, converted_type) {
137
- (Some(PrimitiveLogicalType::String), _) => DataType::Utf8,
138
- (Some(PrimitiveLogicalType::Json), _) => DataType::Binary,
139
- (Some(PrimitiveLogicalType::Bson), _) => DataType::Binary,
140
- (Some(PrimitiveLogicalType::Enum), _) => DataType::Binary,
141
- (_, Some(PrimitiveConvertedType::Json)) => DataType::Binary,
142
- (_, Some(PrimitiveConvertedType::Bson)) => DataType::Binary,
143
- (_, Some(PrimitiveConvertedType::Enum)) => DataType::Binary,
144
- (_, Some(PrimitiveConvertedType::Utf8)) => DataType::Utf8,
145
- (_, _) => DataType::Binary,
146
- }
147
- }
148
-
149
- fn from_fixed_len_byte_array(
150
- length: usize,
151
- logical_type: Option<PrimitiveLogicalType>,
152
- converted_type: Option<PrimitiveConvertedType>,
153
- ) -> DataType {
154
- match (logical_type, converted_type) {
155
- (Some(PrimitiveLogicalType::Decimal(precision, scale)), _) => {
156
- DataType::Decimal(precision, scale)
157
- }
158
- (None, Some(PrimitiveConvertedType::Decimal(precision, scale))) => {
159
- DataType::Decimal(precision, scale)
160
- }
161
- (None, Some(PrimitiveConvertedType::Interval)) => {
162
- // There is currently no reliable way of determining which IntervalUnit
163
- // to return. Thus without the original Arrow schema, the results
164
- // would be incorrect if all 12 bytes of the interval are populated
165
- DataType::Interval(IntervalUnit::DayTime)
166
- }
167
- _ => DataType::FixedSizeBinary(length),
168
- }
169
- }
170
-
171
- /// Maps a [`PhysicalType`] with optional metadata to a [`DataType`]
172
- fn to_primitive_type_inner(primitive_type: &PrimitiveType) -> DataType {
173
- match primitive_type.physical_type {
174
- PhysicalType::Boolean => DataType::Boolean,
175
- PhysicalType::Int32 => {
176
- from_int32(primitive_type.logical_type, primitive_type.converted_type)
177
- }
178
- PhysicalType::Int64 => {
179
- from_int64(primitive_type.logical_type, primitive_type.converted_type)
180
- }
181
- PhysicalType::Int96 => DataType::Timestamp(TimeUnit::Nanosecond, None),
182
- PhysicalType::Float => DataType::Float32,
183
- PhysicalType::Double => DataType::Float64,
184
- PhysicalType::ByteArray => {
185
- from_byte_array(&primitive_type.logical_type, &primitive_type.converted_type)
186
- }
187
- PhysicalType::FixedLenByteArray(length) => from_fixed_len_byte_array(
188
- length,
189
- primitive_type.logical_type,
190
- primitive_type.converted_type,
191
- ),
192
- }
193
- }
194
-
195
- /// Entry point for converting parquet primitive type to arrow type.
196
- ///
197
- /// This function takes care of repetition.
198
- fn to_primitive_type(primitive_type: &PrimitiveType) -> DataType {
199
- let base_type = to_primitive_type_inner(primitive_type);
200
-
201
- if primitive_type.field_info.repetition == Repetition::Repeated {
202
- DataType::List(Box::new(Field::new(
203
- &primitive_type.field_info.name,
204
- base_type,
205
- is_nullable(&primitive_type.field_info),
206
- )))
207
- } else {
208
- base_type
209
- }
210
- }
211
-
212
- fn non_repeated_group(
213
- logical_type: &Option<GroupLogicalType>,
214
- converted_type: &Option<GroupConvertedType>,
215
- fields: &[ParquetType],
216
- parent_name: &str,
217
- ) -> Option<DataType> {
218
- debug_assert!(!fields.is_empty());
219
- match (logical_type, converted_type) {
220
- (Some(GroupLogicalType::List), _) => to_list(fields, parent_name),
221
- (None, Some(GroupConvertedType::List)) => to_list(fields, parent_name),
222
- (Some(GroupLogicalType::Map), _) => to_list(fields, parent_name),
223
- (None, Some(GroupConvertedType::Map) | Some(GroupConvertedType::MapKeyValue)) => {
224
- to_map(fields)
225
- }
226
- _ => to_struct(fields),
227
- }
228
- }
229
-
230
- /// Converts a parquet group type to an arrow [`DataType::Struct`].
231
- /// Returns [`None`] if all its fields are empty
232
- fn to_struct(fields: &[ParquetType]) -> Option<DataType> {
233
- let fields = fields.iter().filter_map(to_field).collect::<Vec<Field>>();
234
- if fields.is_empty() {
235
- None
236
- } else {
237
- Some(DataType::Struct(fields))
238
- }
239
- }
240
-
241
- /// Converts a parquet group type to an arrow [`DataType::Struct`].
242
- /// Returns [`None`] if all its fields are empty
243
- fn to_map(fields: &[ParquetType]) -> Option<DataType> {
244
- let inner = to_field(&fields[0])?;
245
- Some(DataType::Map(Box::new(inner), false))
246
- }
247
-
248
- /// Entry point for converting parquet group type.
249
- ///
250
- /// This function takes care of logical type and repetition.
251
- fn to_group_type(
252
- field_info: &FieldInfo,
253
- logical_type: &Option<GroupLogicalType>,
254
- converted_type: &Option<GroupConvertedType>,
255
- fields: &[ParquetType],
256
- parent_name: &str,
257
- ) -> Option<DataType> {
258
- debug_assert!(!fields.is_empty());
259
- if field_info.repetition == Repetition::Repeated {
260
- Some(DataType::List(Box::new(Field::new(
261
- &field_info.name,
262
- to_struct(fields)?,
263
- is_nullable(field_info),
264
- ))))
265
- } else {
266
- non_repeated_group(logical_type, converted_type, fields, parent_name)
267
- }
268
- }
269
-
270
- /// Checks whether this schema is nullable.
271
- pub(crate) fn is_nullable(field_info: &FieldInfo) -> bool {
272
- match field_info.repetition {
273
- Repetition::Optional => true,
274
- Repetition::Repeated => true,
275
- Repetition::Required => false,
276
- }
277
- }
278
-
279
- /// Converts parquet schema to arrow field.
280
- /// Returns `None` iff the parquet type has no associated primitive types,
281
- /// i.e. if it is a column-less group type.
282
- fn to_field(type_: &ParquetType) -> Option<Field> {
283
- Some(Field::new(
284
- &type_.get_field_info().name,
285
- to_data_type(type_)?,
286
- is_nullable(type_.get_field_info()),
287
- ))
288
- }
289
-
290
- /// Converts a parquet list to arrow list.
291
- ///
292
- /// To fully understand this algorithm, please refer to
293
- /// [parquet doc](https://github.com/apache/parquet-format/blob/master/LogicalTypes.md).
294
- fn to_list(fields: &[ParquetType], parent_name: &str) -> Option<DataType> {
295
- let item = fields.first().unwrap();
296
-
297
- let item_type = match item {
298
- ParquetType::PrimitiveType(primitive) => Some(to_primitive_type_inner(primitive)),
299
- ParquetType::GroupType { fields, .. } => {
300
- if fields.len() == 1
301
- && item.name() != "array"
302
- && item.name() != format!("{parent_name}_tuple")
303
- {
304
- // extract the repetition field
305
- let nested_item = fields.first().unwrap();
306
- to_data_type(nested_item)
307
- } else {
308
- to_struct(fields)
309
- }
310
- }
311
- }?;
312
-
313
- // Check that the name of the list child is "list", in which case we
314
- // get the child nullability and name (normally "element") from the nested
315
- // group type.
316
- // Without this step, the child incorrectly inherits the parent's optionality
317
- let (list_item_name, item_is_optional) = match item {
318
- ParquetType::GroupType {
319
- field_info, fields, ..
320
- } if field_info.name == "list" && fields.len() == 1 => {
321
- let field = fields.first().unwrap();
322
- (
323
- &field.get_field_info().name,
324
- field.get_field_info().repetition != Repetition::Required,
325
- )
326
- }
327
- _ => (
328
- &item.get_field_info().name,
329
- item.get_field_info().repetition != Repetition::Required,
330
- ),
331
- };
332
-
333
- Some(DataType::List(Box::new(Field::new(
334
- list_item_name,
335
- item_type,
336
- item_is_optional,
337
- ))))
338
- }
339
-
340
- /// Converts parquet schema to arrow data type.
341
- ///
342
- /// This function discards schema name.
343
- ///
344
- /// If this schema is a primitive type and not included in the leaves, the result is
345
- /// Ok(None).
346
- ///
347
- /// If this schema is a group type and none of its children is reserved in the
348
- /// conversion, the result is Ok(None).
349
- pub(crate) fn to_data_type(type_: &ParquetType) -> Option<DataType> {
350
- match type_ {
351
- ParquetType::PrimitiveType(primitive) => Some(to_primitive_type(primitive)),
352
- ParquetType::GroupType {
353
- field_info,
354
- logical_type,
355
- converted_type,
356
- fields,
357
- } => {
358
- if fields.is_empty() {
359
- None
360
- } else {
361
- to_group_type(
362
- field_info,
363
- logical_type,
364
- converted_type,
365
- fields,
366
- &field_info.name,
367
- )
368
- }
369
- }
370
- }
371
- }
372
-
373
- #[cfg(test)]
374
- mod tests {
375
- use parquet2::metadata::SchemaDescriptor;
376
-
377
- use super::*;
378
-
379
- use crate::datatypes::{DataType, Field, TimeUnit};
380
- use crate::error::Result;
381
-
382
- #[test]
383
- fn test_flat_primitives() -> Result<()> {
384
- let message = "
385
- message test_schema {
386
- REQUIRED BOOLEAN boolean;
387
- REQUIRED INT32 int8 (INT_8);
388
- REQUIRED INT32 int16 (INT_16);
389
- REQUIRED INT32 uint8 (INTEGER(8,false));
390
- REQUIRED INT32 uint16 (INTEGER(16,false));
391
- REQUIRED INT32 int32;
392
- REQUIRED INT64 int64 ;
393
- OPTIONAL DOUBLE double;
394
- OPTIONAL FLOAT float;
395
- OPTIONAL BINARY string (UTF8);
396
- OPTIONAL BINARY string_2 (STRING);
397
- }
398
- ";
399
- let expected = &[
400
- Field::new("boolean", DataType::Boolean, false),
401
- Field::new("int8", DataType::Int8, false),
402
- Field::new("int16", DataType::Int16, false),
403
- Field::new("uint8", DataType::UInt8, false),
404
- Field::new("uint16", DataType::UInt16, false),
405
- Field::new("int32", DataType::Int32, false),
406
- Field::new("int64", DataType::Int64, false),
407
- Field::new("double", DataType::Float64, true),
408
- Field::new("float", DataType::Float32, true),
409
- Field::new("string", DataType::Utf8, true),
410
- Field::new("string_2", DataType::Utf8, true),
411
- ];
412
-
413
- let parquet_schema = SchemaDescriptor::try_from_message(message)?;
414
- let fields = parquet_to_arrow_schema(parquet_schema.fields());
415
-
416
- assert_eq!(fields, expected);
417
- Ok(())
418
- }
419
-
420
- #[test]
421
- fn test_byte_array_fields() -> Result<()> {
422
- let message = "
423
- message test_schema {
424
- REQUIRED BYTE_ARRAY binary;
425
- REQUIRED FIXED_LEN_BYTE_ARRAY (20) fixed_binary;
426
- }
427
- ";
428
- let expected = vec![
429
- Field::new("binary", DataType::Binary, false),
430
- Field::new("fixed_binary", DataType::FixedSizeBinary(20), false),
431
- ];
432
-
433
- let parquet_schema = SchemaDescriptor::try_from_message(message)?;
434
- let fields = parquet_to_arrow_schema(parquet_schema.fields());
435
-
436
- assert_eq!(fields, expected);
437
- Ok(())
438
- }
439
-
440
- #[test]
441
- fn test_duplicate_fields() -> Result<()> {
442
- let message = "
443
- message test_schema {
444
- REQUIRED BOOLEAN boolean;
445
- REQUIRED INT32 int8 (INT_8);
446
- }
447
- ";
448
- let expected = &[
449
- Field::new("boolean", DataType::Boolean, false),
450
- Field::new("int8", DataType::Int8, false),
451
- ];
452
-
453
- let parquet_schema = SchemaDescriptor::try_from_message(message)?;
454
- let fields = parquet_to_arrow_schema(parquet_schema.fields());
455
-
456
- assert_eq!(fields, expected);
457
- Ok(())
458
- }
459
-
460
- #[test]
461
- fn test_parquet_lists() -> Result<()> {
462
- let mut arrow_fields = Vec::new();
463
-
464
- // LIST encoding example taken from parquet-format/LogicalTypes.md
465
- let message_type = "
466
- message test_schema {
467
- REQUIRED GROUP my_list (LIST) {
468
- REPEATED GROUP list {
469
- OPTIONAL BINARY element (UTF8);
470
- }
471
- }
472
- OPTIONAL GROUP my_list (LIST) {
473
- REPEATED GROUP list {
474
- REQUIRED BINARY element (UTF8);
475
- }
476
- }
477
- OPTIONAL GROUP array_of_arrays (LIST) {
478
- REPEATED GROUP list {
479
- REQUIRED GROUP element (LIST) {
480
- REPEATED GROUP list {
481
- REQUIRED INT32 element;
482
- }
483
- }
484
- }
485
- }
486
- OPTIONAL GROUP my_list (LIST) {
487
- REPEATED GROUP element {
488
- REQUIRED BINARY str (UTF8);
489
- }
490
- }
491
- OPTIONAL GROUP my_list (LIST) {
492
- REPEATED INT32 element;
493
- }
494
- OPTIONAL GROUP my_list (LIST) {
495
- REPEATED GROUP element {
496
- REQUIRED BINARY str (UTF8);
497
- REQUIRED INT32 num;
498
- }
499
- }
500
- OPTIONAL GROUP my_list (LIST) {
501
- REPEATED GROUP array {
502
- REQUIRED BINARY str (UTF8);
503
- }
504
-
505
- }
506
- OPTIONAL GROUP my_list (LIST) {
507
- REPEATED GROUP my_list_tuple {
508
- REQUIRED BINARY str (UTF8);
509
- }
510
- }
511
- REPEATED INT32 name;
512
- }
513
- ";
514
-
515
- // // List<String> (list non-null, elements nullable)
516
- // required group my_list (LIST) {
517
- // repeated group list {
518
- // optional binary element (UTF8);
519
- // }
520
- // }
521
- {
522
- arrow_fields.push(Field::new(
523
- "my_list",
524
- DataType::List(Box::new(Field::new("element", DataType::Utf8, true))),
525
- false,
526
- ));
527
- }
528
-
529
- // // List<String> (list nullable, elements non-null)
530
- // optional group my_list (LIST) {
531
- // repeated group list {
532
- // required binary element (UTF8);
533
- // }
534
- // }
535
- {
536
- arrow_fields.push(Field::new(
537
- "my_list",
538
- DataType::List(Box::new(Field::new("element", DataType::Utf8, false))),
539
- true,
540
- ));
541
- }
542
-
543
- // Element types can be nested structures. For example, a list of lists:
544
- //
545
- // // List<List<Integer>>
546
- // optional group array_of_arrays (LIST) {
547
- // repeated group list {
548
- // required group element (LIST) {
549
- // repeated group list {
550
- // required int32 element;
551
- // }
552
- // }
553
- // }
554
- // }
555
- {
556
- let arrow_inner_list =
557
- DataType::List(Box::new(Field::new("element", DataType::Int32, false)));
558
- arrow_fields.push(Field::new(
559
- "array_of_arrays",
560
- DataType::List(Box::new(Field::new("element", arrow_inner_list, false))),
561
- true,
562
- ));
563
- }
564
-
565
- // // List<String> (list nullable, elements non-null)
566
- // optional group my_list (LIST) {
567
- // repeated group element {
568
- // required binary str (UTF8);
569
- // };
570
- // }
571
- {
572
- arrow_fields.push(Field::new(
573
- "my_list",
574
- DataType::List(Box::new(Field::new("element", DataType::Utf8, true))),
575
- true,
576
- ));
577
- }
578
-
579
- // // List<Integer> (nullable list, non-null elements)
580
- // optional group my_list (LIST) {
581
- // repeated int32 element;
582
- // }
583
- {
584
- arrow_fields.push(Field::new(
585
- "my_list",
586
- DataType::List(Box::new(Field::new("element", DataType::Int32, true))),
587
- true,
588
- ));
589
- }
590
-
591
- // // List<Tuple<String, Integer>> (nullable list, non-null elements)
592
- // optional group my_list (LIST) {
593
- // repeated group element {
594
- // required binary str (UTF8);
595
- // required int32 num;
596
- // };
597
- // }
598
- {
599
- let arrow_struct = DataType::Struct(vec![
600
- Field::new("str", DataType::Utf8, false),
601
- Field::new("num", DataType::Int32, false),
602
- ]);
603
- arrow_fields.push(Field::new(
604
- "my_list",
605
- DataType::List(Box::new(Field::new("element", arrow_struct, true))),
606
- true,
607
- ));
608
- }
609
-
610
- // // List<OneTuple<String>> (nullable list, non-null elements)
611
- // optional group my_list (LIST) {
612
- // repeated group array {
613
- // required binary str (UTF8);
614
- // };
615
- // }
616
- // Special case: group is named array
617
- {
618
- let arrow_struct = DataType::Struct(vec![Field::new("str", DataType::Utf8, false)]);
619
- arrow_fields.push(Field::new(
620
- "my_list",
621
- DataType::List(Box::new(Field::new("array", arrow_struct, true))),
622
- true,
623
- ));
624
- }
625
-
626
- // // List<OneTuple<String>> (nullable list, non-null elements)
627
- // optional group my_list (LIST) {
628
- // repeated group my_list_tuple {
629
- // required binary str (UTF8);
630
- // };
631
- // }
632
- // Special case: group named ends in _tuple
633
- {
634
- let arrow_struct = DataType::Struct(vec![Field::new("str", DataType::Utf8, false)]);
635
- arrow_fields.push(Field::new(
636
- "my_list",
637
- DataType::List(Box::new(Field::new("my_list_tuple", arrow_struct, true))),
638
- true,
639
- ));
640
- }
641
-
642
- // One-level encoding: Only allows required lists with required cells
643
- // repeated value_type name
644
- {
645
- arrow_fields.push(Field::new(
646
- "name",
647
- DataType::List(Box::new(Field::new("name", DataType::Int32, true))),
648
- true,
649
- ));
650
- }
651
-
652
- let parquet_schema = SchemaDescriptor::try_from_message(message_type)?;
653
- let fields = parquet_to_arrow_schema(parquet_schema.fields());
654
-
655
- assert_eq!(arrow_fields, fields);
656
- Ok(())
657
- }
658
-
659
- #[test]
660
- fn test_parquet_list_nullable() -> Result<()> {
661
- let mut arrow_fields = Vec::new();
662
-
663
- let message_type = "
664
- message test_schema {
665
- REQUIRED GROUP my_list1 (LIST) {
666
- REPEATED GROUP list {
667
- OPTIONAL BINARY element (UTF8);
668
- }
669
- }
670
- OPTIONAL GROUP my_list2 (LIST) {
671
- REPEATED GROUP list {
672
- REQUIRED BINARY element (UTF8);
673
- }
674
- }
675
- REQUIRED GROUP my_list3 (LIST) {
676
- REPEATED GROUP list {
677
- REQUIRED BINARY element (UTF8);
678
- }
679
- }
680
- }
681
- ";
682
-
683
- // // List<String> (list non-null, elements nullable)
684
- // required group my_list1 (LIST) {
685
- // repeated group list {
686
- // optional binary element (UTF8);
687
- // }
688
- // }
689
- {
690
- arrow_fields.push(Field::new(
691
- "my_list1",
692
- DataType::List(Box::new(Field::new("element", DataType::Utf8, true))),
693
- false,
694
- ));
695
- }
696
-
697
- // // List<String> (list nullable, elements non-null)
698
- // optional group my_list2 (LIST) {
699
- // repeated group list {
700
- // required binary element (UTF8);
701
- // }
702
- // }
703
- {
704
- arrow_fields.push(Field::new(
705
- "my_list2",
706
- DataType::List(Box::new(Field::new("element", DataType::Utf8, false))),
707
- true,
708
- ));
709
- }
710
-
711
- // // List<String> (list non-null, elements non-null)
712
- // repeated group my_list3 (LIST) {
713
- // repeated group list {
714
- // required binary element (UTF8);
715
- // }
716
- // }
717
- {
718
- arrow_fields.push(Field::new(
719
- "my_list3",
720
- DataType::List(Box::new(Field::new("element", DataType::Utf8, false))),
721
- false,
722
- ));
723
- }
724
-
725
- let parquet_schema = SchemaDescriptor::try_from_message(message_type)?;
726
- let fields = parquet_to_arrow_schema(parquet_schema.fields());
727
-
728
- assert_eq!(arrow_fields, fields);
729
- Ok(())
730
- }
731
-
732
- #[test]
733
- fn test_nested_schema() -> Result<()> {
734
- let mut arrow_fields = Vec::new();
735
- {
736
- let group1_fields = vec![
737
- Field::new("leaf1", DataType::Boolean, false),
738
- Field::new("leaf2", DataType::Int32, false),
739
- ];
740
- let group1_struct = Field::new("group1", DataType::Struct(group1_fields), false);
741
- arrow_fields.push(group1_struct);
742
-
743
- let leaf3_field = Field::new("leaf3", DataType::Int64, false);
744
- arrow_fields.push(leaf3_field);
745
- }
746
-
747
- let message_type = "
748
- message test_schema {
749
- REQUIRED GROUP group1 {
750
- REQUIRED BOOLEAN leaf1;
751
- REQUIRED INT32 leaf2;
752
- }
753
- REQUIRED INT64 leaf3;
754
- }
755
- ";
756
-
757
- let parquet_schema = SchemaDescriptor::try_from_message(message_type)?;
758
- let fields = parquet_to_arrow_schema(parquet_schema.fields());
759
-
760
- assert_eq!(arrow_fields, fields);
761
- Ok(())
762
- }
763
-
764
- #[test]
765
- fn test_repeated_nested_schema() -> Result<()> {
766
- let mut arrow_fields = Vec::new();
767
- {
768
- arrow_fields.push(Field::new("leaf1", DataType::Int32, true));
769
-
770
- let inner_group_list = Field::new(
771
- "innerGroup",
772
- DataType::List(Box::new(Field::new(
773
- "innerGroup",
774
- DataType::Struct(vec![Field::new("leaf3", DataType::Int32, true)]),
775
- true,
776
- ))),
777
- true,
778
- );
779
-
780
- let outer_group_list = Field::new(
781
- "outerGroup",
782
- DataType::List(Box::new(Field::new(
783
- "outerGroup",
784
- DataType::Struct(vec![
785
- Field::new("leaf2", DataType::Int32, true),
786
- inner_group_list,
787
- ]),
788
- true,
789
- ))),
790
- true,
791
- );
792
- arrow_fields.push(outer_group_list);
793
- }
794
-
795
- let message_type = "
796
- message test_schema {
797
- OPTIONAL INT32 leaf1;
798
- REPEATED GROUP outerGroup {
799
- OPTIONAL INT32 leaf2;
800
- REPEATED GROUP innerGroup {
801
- OPTIONAL INT32 leaf3;
802
- }
803
- }
804
- }
805
- ";
806
-
807
- let parquet_schema = SchemaDescriptor::try_from_message(message_type)?;
808
- let fields = parquet_to_arrow_schema(parquet_schema.fields());
809
-
810
- assert_eq!(arrow_fields, fields);
811
- Ok(())
812
- }
813
-
814
- #[test]
815
- fn test_column_desc_to_field() -> Result<()> {
816
- let message_type = "
817
- message test_schema {
818
- REQUIRED BOOLEAN boolean;
819
- REQUIRED INT32 int8 (INT_8);
820
- REQUIRED INT32 uint8 (INTEGER(8,false));
821
- REQUIRED INT32 int16 (INT_16);
822
- REQUIRED INT32 uint16 (INTEGER(16,false));
823
- REQUIRED INT32 int32;
824
- REQUIRED INT64 int64;
825
- OPTIONAL DOUBLE double;
826
- OPTIONAL FLOAT float;
827
- OPTIONAL BINARY string (UTF8);
828
- REPEATED BOOLEAN bools;
829
- OPTIONAL INT32 date (DATE);
830
- OPTIONAL INT32 time_milli (TIME_MILLIS);
831
- OPTIONAL INT64 time_micro (TIME_MICROS);
832
- OPTIONAL INT64 time_nano (TIME(NANOS,false));
833
- OPTIONAL INT64 ts_milli (TIMESTAMP_MILLIS);
834
- REQUIRED INT64 ts_micro (TIMESTAMP_MICROS);
835
- REQUIRED INT64 ts_nano (TIMESTAMP(NANOS,true));
836
- }
837
- ";
838
- let arrow_fields = vec![
839
- Field::new("boolean", DataType::Boolean, false),
840
- Field::new("int8", DataType::Int8, false),
841
- Field::new("uint8", DataType::UInt8, false),
842
- Field::new("int16", DataType::Int16, false),
843
- Field::new("uint16", DataType::UInt16, false),
844
- Field::new("int32", DataType::Int32, false),
845
- Field::new("int64", DataType::Int64, false),
846
- Field::new("double", DataType::Float64, true),
847
- Field::new("float", DataType::Float32, true),
848
- Field::new("string", DataType::Utf8, true),
849
- Field::new(
850
- "bools",
851
- DataType::List(Box::new(Field::new("bools", DataType::Boolean, true))),
852
- true,
853
- ),
854
- Field::new("date", DataType::Date32, true),
855
- Field::new("time_milli", DataType::Time32(TimeUnit::Millisecond), true),
856
- Field::new("time_micro", DataType::Time64(TimeUnit::Microsecond), true),
857
- Field::new("time_nano", DataType::Time64(TimeUnit::Nanosecond), true),
858
- Field::new(
859
- "ts_milli",
860
- DataType::Timestamp(TimeUnit::Millisecond, None),
861
- true,
862
- ),
863
- Field::new(
864
- "ts_micro",
865
- DataType::Timestamp(TimeUnit::Microsecond, None),
866
- false,
867
- ),
868
- Field::new(
869
- "ts_nano",
870
- DataType::Timestamp(TimeUnit::Nanosecond, Some("+00:00".to_string())),
871
- false,
872
- ),
873
- ];
874
-
875
- let parquet_schema = SchemaDescriptor::try_from_message(message_type)?;
876
- let fields = parquet_to_arrow_schema(parquet_schema.fields());
877
-
878
- assert_eq!(arrow_fields, fields);
879
- Ok(())
880
- }
881
-
882
- #[test]
883
- fn test_field_to_column_desc() -> Result<()> {
884
- let message_type = "
885
- message arrow_schema {
886
- REQUIRED BOOLEAN boolean;
887
- REQUIRED INT32 int8 (INT_8);
888
- REQUIRED INT32 int16 (INTEGER(16,true));
889
- REQUIRED INT32 int32;
890
- REQUIRED INT64 int64;
891
- OPTIONAL DOUBLE double;
892
- OPTIONAL FLOAT float;
893
- OPTIONAL BINARY string (STRING);
894
- OPTIONAL GROUP bools (LIST) {
895
- REPEATED GROUP list {
896
- OPTIONAL BOOLEAN element;
897
- }
898
- }
899
- REQUIRED GROUP bools_non_null (LIST) {
900
- REPEATED GROUP list {
901
- REQUIRED BOOLEAN element;
902
- }
903
- }
904
- OPTIONAL INT32 date (DATE);
905
- OPTIONAL INT32 time_milli (TIME(MILLIS,false));
906
- OPTIONAL INT64 time_micro (TIME_MICROS);
907
- OPTIONAL INT64 ts_milli (TIMESTAMP_MILLIS);
908
- REQUIRED INT64 ts_micro (TIMESTAMP(MICROS,false));
909
- REQUIRED GROUP struct {
910
- REQUIRED BOOLEAN bools;
911
- REQUIRED INT32 uint32 (INTEGER(32,false));
912
- REQUIRED GROUP int32 (LIST) {
913
- REPEATED GROUP list {
914
- OPTIONAL INT32 element;
915
- }
916
- }
917
- }
918
- REQUIRED BINARY dictionary_strings (STRING);
919
- }
920
- ";
921
-
922
- let arrow_fields = vec![
923
- Field::new("boolean", DataType::Boolean, false),
924
- Field::new("int8", DataType::Int8, false),
925
- Field::new("int16", DataType::Int16, false),
926
- Field::new("int32", DataType::Int32, false),
927
- Field::new("int64", DataType::Int64, false),
928
- Field::new("double", DataType::Float64, true),
929
- Field::new("float", DataType::Float32, true),
930
- Field::new("string", DataType::Utf8, true),
931
- Field::new(
932
- "bools",
933
- DataType::List(Box::new(Field::new("element", DataType::Boolean, true))),
934
- true,
935
- ),
936
- Field::new(
937
- "bools_non_null",
938
- DataType::List(Box::new(Field::new("element", DataType::Boolean, false))),
939
- false,
940
- ),
941
- Field::new("date", DataType::Date32, true),
942
- Field::new("time_milli", DataType::Time32(TimeUnit::Millisecond), true),
943
- Field::new("time_micro", DataType::Time64(TimeUnit::Microsecond), true),
944
- Field::new(
945
- "ts_milli",
946
- DataType::Timestamp(TimeUnit::Millisecond, None),
947
- true,
948
- ),
949
- Field::new(
950
- "ts_micro",
951
- DataType::Timestamp(TimeUnit::Microsecond, None),
952
- false,
953
- ),
954
- Field::new(
955
- "struct",
956
- DataType::Struct(vec![
957
- Field::new("bools", DataType::Boolean, false),
958
- Field::new("uint32", DataType::UInt32, false),
959
- Field::new(
960
- "int32",
961
- DataType::List(Box::new(Field::new("element", DataType::Int32, true))),
962
- false,
963
- ),
964
- ]),
965
- false,
966
- ),
967
- Field::new("dictionary_strings", DataType::Utf8, false),
968
- ];
969
-
970
- let parquet_schema = SchemaDescriptor::try_from_message(message_type)?;
971
- let fields = parquet_to_arrow_schema(parquet_schema.fields());
972
-
973
- assert_eq!(arrow_fields, fields);
974
- Ok(())
975
- }
976
- }