zsv 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +36 -0
- data/LICENSE +21 -0
- data/README.md +311 -0
- data/ext/zsv/common.h +34 -0
- data/ext/zsv/extconf.rb +137 -0
- data/ext/zsv/options.c +126 -0
- data/ext/zsv/options.h +31 -0
- data/ext/zsv/options_internal.h +8 -0
- data/ext/zsv/parser.c +300 -0
- data/ext/zsv/parser.h +62 -0
- data/ext/zsv/row.c +122 -0
- data/ext/zsv/row.h +39 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/2db.c +756 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/2json.c +381 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/2tsv.c +228 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/builtin/help.c +123 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/builtin/license.c +39 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/builtin/register.c +104 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/builtin/thirdparty.c +41 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/builtin/unregister.c +1 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/builtin/version.c +14 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/check/simdutf_wrapper.h +19 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/check/utf8.c +116 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/check.c +194 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/cli.c +796 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/cli_const.h +41 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/cli_export.h +16 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/cli_ini.c +280 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/cli_internal.h +36 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/compare.c +913 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/compare.h +23 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/compare_added_column.c +20 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/compare_internal.h +140 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/compare_sort.c +91 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/compare_unique_colname.c +81 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/count-pull.c +82 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/count.c +404 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/desc.c +569 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/echo.c +365 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/ext_example/my_extension.c +366 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/ext_example/mysheet_extension.c +341 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/ext_template/YOUR_EXTENSION_zsvext.c +263 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/external/inih/ini.c +298 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/external/inih/ini.h +157 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/external/json_writer-1.01/json_numeric.c +177 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/external/json_writer-1.01/jsonwriter.c +444 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/external/json_writer-1.01/jsonwriter.h +145 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/external/json_writer-1.01/utils.c +110 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/external/memfile-1.0/include/memfile.h +15 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/external/memfile-1.0/src/memfile.c +64 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/external/sglib/sglib.h +1955 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/external/simdutf/simdutf.h +6802 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/external/sqlite3/sqlite3.c +230517 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/external/sqlite3/sqlite3.h +12174 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/external/sqlite3/sqlite3_and_csv_vtab.c +2 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/external/sqlite3/sqlite3_csv_vtab-mem.c +142 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/external/sqlite3/sqlite3_csv_vtab-mem.h +49 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/external/sqlite3/sqlite3_csv_vtab-zsv.c +485 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/external/sqlite3/sqlite3_csv_vtab.c +1015 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/external/sqlite3/sqlite3ext.h +663 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/external/sqlite3/vtab_helper.c +85 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/external/yajl/build/yajl-2.1.1/include/yajl/yajl_common.h +75 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/external/yajl/build/yajl-2.1.1/include/yajl/yajl_gen.h +167 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/external/yajl/build/yajl-2.1.1/include/yajl/yajl_parse.h +228 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/external/yajl/build/yajl-2.1.1/include/yajl/yajl_tree.h +186 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/external/yajl/build/yajl-2.1.1/include/yajl/yajl_version.h +23 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/external/yajl/src/api/yajl_common.h +76 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/external/yajl/src/api/yajl_gen.h +167 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/external/yajl/src/api/yajl_parse.h +238 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/external/yajl/src/api/yajl_tree.h +186 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/external/yajl/src/yajl.c +184 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/external/yajl/src/yajl_alloc.c +52 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/external/yajl/src/yajl_alloc.h +34 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/external/yajl/src/yajl_buf.c +103 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/external/yajl/src/yajl_buf.h +57 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/external/yajl/src/yajl_bytestack.h +69 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/external/yajl/src/yajl_encode.c +220 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/external/yajl/src/yajl_encode.h +34 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/external/yajl/src/yajl_gen.c +362 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/external/yajl/src/yajl_lex.c +764 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/external/yajl/src/yajl_lex.h +117 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/external/yajl/src/yajl_parser.c +508 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/external/yajl/src/yajl_parser.h +78 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/external/yajl/src/yajl_tree.c +505 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/external/yajl/src/yajl_version.c +7 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/external/yajl_helper/yajl_helper/json_value.h +59 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/external/yajl_helper/yajl_helper/yajl_helper.h +208 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/external/yajl_helper/yajl_helper.c +795 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/external/yajl_helper/yajl_helper_internal.h +28 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/flatten.c +851 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/jq.c +106 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/jq.h +6 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/mv.c +113 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/noop.c +90 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/overwrite.c +295 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/paste.c +175 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/pretty.c +693 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/prop.c +980 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/rm.c +131 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/select/fixed.c +130 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/select/internal.h +118 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/select/parallel.c +45 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/select/parallel.h +41 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/select/processing.c +107 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/select/rand.c +20 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/select/regex.c +61 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/select/search.c +14 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/select/selection.c +192 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/select/usage.c +72 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/select-pull.c +812 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/select.c +753 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/serialize.c +372 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/sheet/curses.h +15 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/sheet/cursor.c +119 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/sheet/errors.c +45 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/sheet/file.c +63 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/sheet/file.h +12 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/sheet/filter.c +166 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/sheet/handlers.c +214 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/sheet/handlers_internal.h +128 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/sheet/help.c +43 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/sheet/index.c +81 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/sheet/index.h +25 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/sheet/key-bindings.c +325 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/sheet/key-bindings.h +73 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/sheet/lexer.c +203 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/sheet/newline_handler.c +7 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/sheet/pivot.c +318 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/sheet/procedure.c +134 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/sheet/procedure.h +119 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/sheet/read-data.c +322 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/sheet/screen_buffer.c +203 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/sheet/screen_buffer.h +36 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/sheet/sheet-sql.c +167 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/sheet/sheet_internal.h +36 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/sheet/sqlfilter.c +153 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/sheet/terminfo.c +32 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/sheet/transformation.c +312 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/sheet/transformation.h +29 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/sheet/ui_buffer.c +266 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/sheet/usage.c +9 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/sheet/utf8-width.c +60 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/sheet.c +1007 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/sql.c +453 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/sql_internal.c +101 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/sql_internal.h +49 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/stack.c +393 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/utils/arg.c +322 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/utils/cache.c +228 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/utils/cat.c +91 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/utils/chunk.c +240 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/utils/chunk.h +63 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/utils/clock.c +57 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/utils/db.c +148 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/utils/dirs-no-jq.c +2 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/utils/dirs.c +427 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/utils/dirs_from_json.c +253 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/utils/dirs_to_json.c +121 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/utils/dl.c +20 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/utils/emcc/fs_api.c +159 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/utils/err.c +24 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/utils/file-mem.c +180 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/utils/file.c +256 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/utils/index.c +197 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/utils/index.h +49 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/utils/jq.c +400 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/utils/json.c +120 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/utils/mem.c +18 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/utils/memmem.c +132 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/utils/os.c +178 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/utils/overwrite.c +258 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/utils/overwrite_writer.c +246 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/utils/pcre2-8/pcre2-8-test.c +123 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/utils/pcre2-8/pcre2-8.c +153 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/utils/pcre2-8/pcre2-8.h +54 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/utils/prop.c +267 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/utils/signal.c +53 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/utils/string.c +357 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/utils/win/dir_exists_longpath.c +83 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/utils/win/dl.c +33 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/utils/win/fopen_longpath.c +184 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/utils/win/foreach_dirent_longpath.c +292 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/utils/win/io.c +259 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/utils/win/io.h +13 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/utils/win/mkdir_longpath.c +255 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/utils/win/remove_longpath.c +96 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/utils/writer.c +361 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/zsv_command.h +40 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/zsv_command_standalone.c +16 -0
- data/ext/zsv/vendor/zsv-1.3.0/app/zsv_main.h +44 -0
- data/ext/zsv/vendor/zsv-1.3.0/examples/js/zsv_parser_api_dummy.c +3 -0
- data/ext/zsv/vendor/zsv-1.3.0/examples/lib/parse_by_chunk.c +100 -0
- data/ext/zsv/vendor/zsv-1.3.0/examples/lib/print_my_column.c +143 -0
- data/ext/zsv/vendor/zsv-1.3.0/examples/lib/pull.c +89 -0
- data/ext/zsv/vendor/zsv-1.3.0/examples/lib/simple.c +123 -0
- data/ext/zsv/vendor/zsv-1.3.0/fuzz/fuzz.c +16 -0
- data/ext/zsv/vendor/zsv-1.3.0/include/zsv/api.h +336 -0
- data/ext/zsv/vendor/zsv-1.3.0/include/zsv/common.h +361 -0
- data/ext/zsv/vendor/zsv-1.3.0/include/zsv/ext/implementation.h +62 -0
- data/ext/zsv/vendor/zsv-1.3.0/include/zsv/ext/implementation_private.h +113 -0
- data/ext/zsv/vendor/zsv-1.3.0/include/zsv/ext/sheet.h +73 -0
- data/ext/zsv/vendor/zsv-1.3.0/include/zsv/ext.h +329 -0
- data/ext/zsv/vendor/zsv-1.3.0/include/zsv/utils/arg.h +90 -0
- data/ext/zsv/vendor/zsv-1.3.0/include/zsv/utils/cache.h +49 -0
- data/ext/zsv/vendor/zsv-1.3.0/include/zsv/utils/clock.h +36 -0
- data/ext/zsv/vendor/zsv-1.3.0/include/zsv/utils/compiler.h +58 -0
- data/ext/zsv/vendor/zsv-1.3.0/include/zsv/utils/db.h +19 -0
- data/ext/zsv/vendor/zsv-1.3.0/include/zsv/utils/dirs.h +147 -0
- data/ext/zsv/vendor/zsv-1.3.0/include/zsv/utils/dl.h +22 -0
- data/ext/zsv/vendor/zsv-1.3.0/include/zsv/utils/emcc/fs_api.h +28 -0
- data/ext/zsv/vendor/zsv-1.3.0/include/zsv/utils/err.h +22 -0
- data/ext/zsv/vendor/zsv-1.3.0/include/zsv/utils/file-mem.h +17 -0
- data/ext/zsv/vendor/zsv-1.3.0/include/zsv/utils/file.h +99 -0
- data/ext/zsv/vendor/zsv-1.3.0/include/zsv/utils/jq.h +65 -0
- data/ext/zsv/vendor/zsv-1.3.0/include/zsv/utils/json.h +19 -0
- data/ext/zsv/vendor/zsv-1.3.0/include/zsv/utils/mem.h +19 -0
- data/ext/zsv/vendor/zsv-1.3.0/include/zsv/utils/memmem.h +13 -0
- data/ext/zsv/vendor/zsv-1.3.0/include/zsv/utils/os.h +54 -0
- data/ext/zsv/vendor/zsv-1.3.0/include/zsv/utils/overwrite.h +71 -0
- data/ext/zsv/vendor/zsv-1.3.0/include/zsv/utils/overwrite_writer.h +53 -0
- data/ext/zsv/vendor/zsv-1.3.0/include/zsv/utils/prop.h +107 -0
- data/ext/zsv/vendor/zsv-1.3.0/include/zsv/utils/signal.h +18 -0
- data/ext/zsv/vendor/zsv-1.3.0/include/zsv/utils/sql.h +11 -0
- data/ext/zsv/vendor/zsv-1.3.0/include/zsv/utils/string.h +148 -0
- data/ext/zsv/vendor/zsv-1.3.0/include/zsv/utils/utf8.h +41 -0
- data/ext/zsv/vendor/zsv-1.3.0/include/zsv/utils/win/dl.h +25 -0
- data/ext/zsv/vendor/zsv-1.3.0/include/zsv/utils/writer.h +101 -0
- data/ext/zsv/vendor/zsv-1.3.0/include/zsv/zsv_export.h +33 -0
- data/ext/zsv/vendor/zsv-1.3.0/include/zsv.h +20 -0
- data/ext/zsv/vendor/zsv-1.3.0/src/vector_delim.c +60 -0
- data/ext/zsv/vendor/zsv-1.3.0/src/zsv.c +484 -0
- data/ext/zsv/vendor/zsv-1.3.0/src/zsv_internal.c +731 -0
- data/ext/zsv/vendor/zsv-1.3.0/src/zsv_scan_delim.c +285 -0
- data/ext/zsv/vendor/zsv-1.3.0/src/zsv_scan_fixed.c +88 -0
- data/ext/zsv/vendor/zsv-1.3.0/src/zsv_strencode.c +51 -0
- data/ext/zsv/zsv_ext.c +343 -0
- data/lib/zsv/version.rb +5 -0
- data/lib/zsv.rb +81 -0
- metadata +340 -0
|
@@ -0,0 +1,753 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Copyright (C) 2025 Liquidaty and zsv contributors. All rights reserved.
|
|
3
|
+
*
|
|
4
|
+
* This file is part of zsv/lib, distributed under the MIT license as defined at
|
|
5
|
+
* https://opensource.org/licenses/MIT
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
#include <stdio.h>
|
|
9
|
+
#include <assert.h>
|
|
10
|
+
#ifdef _WIN32
|
|
11
|
+
#define _CRT_RAND_S /* for random number generator, used when sampling. must come before including stdlib.h */
|
|
12
|
+
#else
|
|
13
|
+
#include <sys/types.h> // off_t
|
|
14
|
+
#endif
|
|
15
|
+
#include <stdlib.h>
|
|
16
|
+
#include <stdint.h>
|
|
17
|
+
#include <string.h>
|
|
18
|
+
#include <ctype.h>
|
|
19
|
+
#include <time.h>
|
|
20
|
+
#include <stdarg.h>
|
|
21
|
+
|
|
22
|
+
// Added for pthreads and parallel I/O management
|
|
23
|
+
#include <pthread.h>
|
|
24
|
+
#include <string.h> // memcpy, free, etc.
|
|
25
|
+
|
|
26
|
+
#define ZSV_COMMAND select
|
|
27
|
+
#include "zsv_command.h"
|
|
28
|
+
|
|
29
|
+
#include <zsv/utils/writer.h>
|
|
30
|
+
#include <zsv/utils/utf8.h>
|
|
31
|
+
#include <zsv/utils/string.h>
|
|
32
|
+
#include <zsv/utils/mem.h>
|
|
33
|
+
#include <zsv/utils/memmem.h>
|
|
34
|
+
#include <zsv/utils/arg.h>
|
|
35
|
+
#include <zsv/utils/os.h>
|
|
36
|
+
#include <zsv/utils/file.h>
|
|
37
|
+
#include "utils/chunk.h"
|
|
38
|
+
|
|
39
|
+
#include "select/internal.h" // various defines and structs
|
|
40
|
+
#include "select/usage.c" // zsv_select_usage()
|
|
41
|
+
#include "select/rand.c" // demo_random_bw_1_and_100()
|
|
42
|
+
#include "select/fixed.c" // auto_detect_fixed_column_sizes()
|
|
43
|
+
#include "utils/cat.c"
|
|
44
|
+
|
|
45
|
+
// zsv_select_add_search(), zsv_select_search_str_delete()
|
|
46
|
+
#include "select/search.c"
|
|
47
|
+
|
|
48
|
+
// struct zsv_select_regex, zsv_select_add_regex(), zsv_select_regexs_delete()
|
|
49
|
+
#include "select/regex.c"
|
|
50
|
+
|
|
51
|
+
// zsv_select_cell_clean(), zsv_select_row_search_hit()
|
|
52
|
+
#include "select/processing.c"
|
|
53
|
+
|
|
54
|
+
// zsv_select_add_exclusion(), zsv_select_get_header_name(),
|
|
55
|
+
// zsv_select_check_exclusions_are_indexes()
|
|
56
|
+
#include "select/selection.c"
|
|
57
|
+
|
|
58
|
+
#ifndef ZSV_NO_PARALLEL
|
|
59
|
+
#include "select/parallel.c" // zsv_parallel_data_new(), zsv_parallel_data_delete()
|
|
60
|
+
|
|
61
|
+
#define ZSV_SELECT_PARALLEL_MIN_BYTES (1024 * 1024 * 2) // don't parallelize if < 2 MB of data (after header)
|
|
62
|
+
#define ZSV_SELECT_PARALLEL_BUFFER_SZ (1024 * 1024 * 8) // to do: make customizable or dynamic
|
|
63
|
+
|
|
64
|
+
static void zsv_select_data_row(void *ctx);
|
|
65
|
+
|
|
66
|
+
static void zsv_select_data_row_parallel_done(void *ctx) {
|
|
67
|
+
struct zsv_select_data *data = ctx;
|
|
68
|
+
data->next_row_start = zsv_cum_scanned_length(data->parser) - zsv_row_length_raw_bytes(data->parser);
|
|
69
|
+
zsv_abort(data->parser);
|
|
70
|
+
data->cancelled = 1;
|
|
71
|
+
}
|
|
72
|
+
static void zsv_select_data_row_parallel(void *ctx) {
|
|
73
|
+
struct zsv_select_data *data = ctx;
|
|
74
|
+
zsv_select_data_row(ctx);
|
|
75
|
+
|
|
76
|
+
if (UNLIKELY((off_t)zsv_cum_scanned_length(data->parser) >= data->end_offset_limit)) {
|
|
77
|
+
// parse one more row to get accurate next-row start
|
|
78
|
+
zsv_set_row_handler(data->parser, zsv_select_data_row_parallel_done);
|
|
79
|
+
}
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
static void *zsv_select_process_chunk_internal(struct zsv_chunk_data *cdata) {
|
|
83
|
+
if (cdata->start_offset >= cdata->end_offset) {
|
|
84
|
+
cdata->skip = 1;
|
|
85
|
+
return NULL;
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
struct zsv_select_data data = {0}; // local, non-shared zsv_select_data instance
|
|
89
|
+
|
|
90
|
+
// Copy necessary setup data from the global context
|
|
91
|
+
memcpy(&data, cdata->opts->ctx, sizeof(data));
|
|
92
|
+
data.parallel_data = NULL; // clear parallel data pointer in local copy
|
|
93
|
+
data.cancelled = 0; // necessary in case we are re-running due to incorrect chunk start
|
|
94
|
+
|
|
95
|
+
#ifdef HAVE_PCRE2_8
|
|
96
|
+
// duplicate data.search_regexs for thread safety
|
|
97
|
+
if (data.search_regexs)
|
|
98
|
+
data.search_regexs = zsv_select_regexs_dup(data.search_regexs);
|
|
99
|
+
#endif
|
|
100
|
+
|
|
101
|
+
struct zsv_opts opts = {0};
|
|
102
|
+
opts.max_columns = cdata->opts->max_columns;
|
|
103
|
+
opts.max_row_size = cdata->opts->max_row_size;
|
|
104
|
+
opts.delimiter = cdata->opts->delimiter;
|
|
105
|
+
opts.no_quotes = cdata->opts->no_quotes;
|
|
106
|
+
opts.verbose = cdata->opts->verbose;
|
|
107
|
+
opts.malformed_utf8_replace = cdata->opts->malformed_utf8_replace;
|
|
108
|
+
opts.errprintf = cdata->opts->errprintf;
|
|
109
|
+
opts.errf = cdata->opts->errf;
|
|
110
|
+
opts.errclose = cdata->opts->errclose;
|
|
111
|
+
opts.progress = cdata->opts->progress;
|
|
112
|
+
|
|
113
|
+
// set up input
|
|
114
|
+
FILE *stream = fopen(data.input_path, "rb");
|
|
115
|
+
if (!stream) {
|
|
116
|
+
cdata->status = zsv_status_error;
|
|
117
|
+
return NULL;
|
|
118
|
+
}
|
|
119
|
+
fseeko(stream, cdata->start_offset, SEEK_SET);
|
|
120
|
+
|
|
121
|
+
// set up output
|
|
122
|
+
struct zsv_csv_writer_options writer_opts = {0};
|
|
123
|
+
|
|
124
|
+
#ifdef __linux__
|
|
125
|
+
cdata->tmp_output_filename = zsv_get_temp_filename("zsvselect");
|
|
126
|
+
writer_opts.stream = fopen(cdata->tmp_output_filename, "wb");
|
|
127
|
+
#else
|
|
128
|
+
if (!(cdata->tmp_f = zsv_memfile_open(ZSV_SELECT_PARALLEL_BUFFER_SZ)) &&
|
|
129
|
+
!(cdata->tmp_f = zsv_memfile_open(ZSV_SELECT_PARALLEL_BUFFER_SZ / 2)) &&
|
|
130
|
+
!(cdata->tmp_f = zsv_memfile_open(ZSV_SELECT_PARALLEL_BUFFER_SZ / 4)) &&
|
|
131
|
+
!(cdata->tmp_f = zsv_memfile_open(ZSV_SELECT_PARALLEL_BUFFER_SZ / 8)))
|
|
132
|
+
cdata->tmp_f = zsv_memfile_open(0);
|
|
133
|
+
writer_opts.stream = cdata->tmp_f;
|
|
134
|
+
writer_opts.write = (size_t(*)(const void *restrict, size_t, size_t, void *restrict))zsv_memfile_write;
|
|
135
|
+
#endif
|
|
136
|
+
|
|
137
|
+
if (!writer_opts.stream) {
|
|
138
|
+
cdata->status = zsv_status_memory;
|
|
139
|
+
fclose(stream);
|
|
140
|
+
return NULL;
|
|
141
|
+
}
|
|
142
|
+
data.csv_writer = zsv_writer_new(&writer_opts);
|
|
143
|
+
|
|
144
|
+
// initialize parser
|
|
145
|
+
opts.stream = stream;
|
|
146
|
+
opts.row_handler = zsv_select_data_row_parallel;
|
|
147
|
+
opts.ctx = &data;
|
|
148
|
+
data.end_offset_limit = cdata->end_offset - cdata->start_offset; // set chunk boundary
|
|
149
|
+
data.parser = zsv_new(&opts);
|
|
150
|
+
|
|
151
|
+
// process
|
|
152
|
+
enum zsv_status status = zsv_status_ok;
|
|
153
|
+
while (status == zsv_status_ok && !zsv_signal_interrupted && !data.cancelled)
|
|
154
|
+
status = zsv_parse_more(data.parser);
|
|
155
|
+
|
|
156
|
+
#ifndef ZSV_NOPARALLEL
|
|
157
|
+
if (!data.next_row_start)
|
|
158
|
+
// unlikely, but maybe conceivable if chunk split was not accurate and
|
|
159
|
+
// a correctly-split chunk's last row entirely ate the next incorrectly-split chunk
|
|
160
|
+
data.next_row_start = zsv_cum_scanned_length(data.parser) + 1;
|
|
161
|
+
#endif
|
|
162
|
+
|
|
163
|
+
// clean up
|
|
164
|
+
zsv_delete(data.parser);
|
|
165
|
+
#ifdef HAVE_PCRE2_8
|
|
166
|
+
zsv_select_regexs_delete(data.search_regexs);
|
|
167
|
+
#endif
|
|
168
|
+
fflush(stream);
|
|
169
|
+
fclose(stream);
|
|
170
|
+
zsv_writer_delete(data.csv_writer);
|
|
171
|
+
#ifdef __linux__
|
|
172
|
+
fclose(writer_opts.stream);
|
|
173
|
+
#endif
|
|
174
|
+
cdata->actual_next_row_start = data.next_row_start + cdata->start_offset;
|
|
175
|
+
cdata->status = zsv_status_ok;
|
|
176
|
+
return NULL;
|
|
177
|
+
}
|
|
178
|
+
|
|
179
|
+
/*
 * pthread entry point for a worker thread.
 * arg is the struct zsv_chunk_data describing the chunk to process; the work
 * is delegated to zsv_select_process_chunk_internal() and its result returned.
 */
static void *zsv_select_process_chunk(void *arg) {
  struct zsv_chunk_data *chunk = arg;
  return zsv_select_process_chunk_internal(chunk);
}
|
|
183
|
+
#endif // ZSV_NO_PARALLEL
|
|
184
|
+
|
|
185
|
+
// zsv_select_output_data_row(): output row data (No change needed)
|
|
186
|
+
static void zsv_select_output_data_row(struct zsv_select_data *data) {
|
|
187
|
+
unsigned int cnt = data->output_cols_count;
|
|
188
|
+
char first = 1;
|
|
189
|
+
if (data->prepend_line_number) {
|
|
190
|
+
zsv_writer_cell_zu(data->csv_writer, first, data->data_row_count);
|
|
191
|
+
first = 0;
|
|
192
|
+
}
|
|
193
|
+
|
|
194
|
+
/* print data row */
|
|
195
|
+
for (unsigned int i = 0; i < cnt; i++) { // for each output column
|
|
196
|
+
unsigned int in_ix = data->out2in[i].ix;
|
|
197
|
+
struct zsv_cell cell = zsv_get_cell(data->parser, in_ix);
|
|
198
|
+
if (UNLIKELY(data->any_clean != 0)) {
|
|
199
|
+
// leading/trailing white may have been converted to NULL for regex search
|
|
200
|
+
while (cell.len && *cell.str == '\0')
|
|
201
|
+
cell.str++, cell.len--;
|
|
202
|
+
while (cell.len && cell.str[cell.len - 1] == '\0')
|
|
203
|
+
cell.len--;
|
|
204
|
+
cell.str = zsv_select_cell_clean(data, cell.str, &cell.quoted, &cell.len);
|
|
205
|
+
}
|
|
206
|
+
if (VERY_UNLIKELY(data->distinct == ZSV_SELECT_DISTINCT_MERGE)) {
|
|
207
|
+
if (UNLIKELY(cell.len == 0)) {
|
|
208
|
+
for (struct zsv_select_uint_list *ix = data->out2in[i].merge.indexes; ix; ix = ix->next) {
|
|
209
|
+
unsigned int m_ix = ix->value;
|
|
210
|
+
cell = zsv_get_cell(data->parser, m_ix);
|
|
211
|
+
if (cell.len) {
|
|
212
|
+
if (UNLIKELY(data->any_clean != 0))
|
|
213
|
+
cell.str = zsv_select_cell_clean(data, cell.str, &cell.quoted, &cell.len);
|
|
214
|
+
if (cell.len)
|
|
215
|
+
break;
|
|
216
|
+
}
|
|
217
|
+
}
|
|
218
|
+
}
|
|
219
|
+
}
|
|
220
|
+
zsv_writer_cell(data->csv_writer, first, cell.str, cell.len, cell.quoted);
|
|
221
|
+
first = 0;
|
|
222
|
+
}
|
|
223
|
+
}
|
|
224
|
+
|
|
225
|
+
static void zsv_select_data_row(void *ctx) {
|
|
226
|
+
struct zsv_select_data *data = ctx;
|
|
227
|
+
if (UNLIKELY(zsv_cell_count(data->parser) == 0 || data->cancelled))
|
|
228
|
+
return;
|
|
229
|
+
|
|
230
|
+
data->data_row_count++;
|
|
231
|
+
|
|
232
|
+
// check if we should skip this row
|
|
233
|
+
data->skip_this_row = 0;
|
|
234
|
+
if (UNLIKELY(data->skip_data_rows)) {
|
|
235
|
+
data->skip_data_rows--;
|
|
236
|
+
data->skip_this_row = 1;
|
|
237
|
+
} else if (UNLIKELY(data->sample_every_n || data->sample_pct)) {
|
|
238
|
+
data->skip_this_row = 1;
|
|
239
|
+
if (data->sample_every_n && data->data_row_count % data->sample_every_n == 1)
|
|
240
|
+
data->skip_this_row = 0;
|
|
241
|
+
if (data->sample_pct && demo_random_bw_1_and_100() <= data->sample_pct)
|
|
242
|
+
data->skip_this_row = 0;
|
|
243
|
+
}
|
|
244
|
+
|
|
245
|
+
if (LIKELY(!data->skip_this_row)) {
|
|
246
|
+
// if we have a search filter, check that
|
|
247
|
+
char skip = 0;
|
|
248
|
+
skip = !zsv_select_row_search_hit(data);
|
|
249
|
+
if (!skip) {
|
|
250
|
+
|
|
251
|
+
// print the data row
|
|
252
|
+
zsv_select_output_data_row(data);
|
|
253
|
+
if (UNLIKELY(data->data_rows_limit > 0))
|
|
254
|
+
if (data->data_row_count + 1 >= data->data_rows_limit)
|
|
255
|
+
data->cancelled = 1;
|
|
256
|
+
}
|
|
257
|
+
}
|
|
258
|
+
if (data->data_row_count % 25000 == 0 && data->verbose)
|
|
259
|
+
fprintf(stderr, "Processed %zu rows\n", data->data_row_count);
|
|
260
|
+
}
|
|
261
|
+
|
|
262
|
+
static void zsv_select_print_header_row(struct zsv_select_data *data) {
|
|
263
|
+
if (data->no_header)
|
|
264
|
+
return;
|
|
265
|
+
zsv_writer_cell_prepend(data->csv_writer, (const unsigned char *)data->prepend_header);
|
|
266
|
+
if (data->prepend_line_number)
|
|
267
|
+
zsv_writer_cell_s(data->csv_writer, 1, (const unsigned char *)"#", 0);
|
|
268
|
+
for (unsigned int i = 0; i < data->output_cols_count; i++) {
|
|
269
|
+
unsigned char *header_name = zsv_select_get_header_name(data, data->out2in[i].ix);
|
|
270
|
+
zsv_writer_cell_s(data->csv_writer, i == 0 && !data->prepend_line_number, header_name, 1);
|
|
271
|
+
}
|
|
272
|
+
zsv_writer_cell_prepend(data->csv_writer, NULL);
|
|
273
|
+
}
|
|
274
|
+
|
|
275
|
+
#ifndef ZSV_NO_PARALLEL
|
|
276
|
+
static int zsv_setup_parallel_chunks(struct zsv_select_data *data, const char *path, size_t header_row_end) {
|
|
277
|
+
if (data->num_chunks <= 1 || !path || !strcmp(path, "-")) {
|
|
278
|
+
data->run_in_parallel = 0;
|
|
279
|
+
return 0;
|
|
280
|
+
}
|
|
281
|
+
|
|
282
|
+
struct zsv_chunk_position *offsets =
|
|
283
|
+
zsv_guess_file_chunks(path, data->num_chunks, ZSV_SELECT_PARALLEL_MIN_BYTES, header_row_end + 1
|
|
284
|
+
#ifndef ZSV_NO_ONLY_CRLF
|
|
285
|
+
,
|
|
286
|
+
data->opts->only_crlf_rowend
|
|
287
|
+
#endif
|
|
288
|
+
);
|
|
289
|
+
if (!offsets)
|
|
290
|
+
return -1; // fall back to serial
|
|
291
|
+
|
|
292
|
+
if (!(data->parallel_data = zsv_parallel_data_new(data->num_chunks))) {
|
|
293
|
+
zsv_free_chunks(offsets);
|
|
294
|
+
fprintf(stderr, "Insufficient memory to parallelize!\n");
|
|
295
|
+
return zsv_status_memory;
|
|
296
|
+
}
|
|
297
|
+
|
|
298
|
+
data->run_in_parallel = 1;
|
|
299
|
+
data->parallel_data->main_data = data;
|
|
300
|
+
data->end_offset_limit = offsets[0].end;
|
|
301
|
+
|
|
302
|
+
for (unsigned int i = 0; i < data->num_chunks; i++) {
|
|
303
|
+
data->parallel_data->chunk_data[i].start_offset = offsets[i].start;
|
|
304
|
+
data->parallel_data->chunk_data[i].end_offset = offsets[i].end;
|
|
305
|
+
if (data->opts->verbose)
|
|
306
|
+
fprintf(stderr, "Chunk %i: %zu - %zu\n", i, (size_t)offsets[i].start, (size_t)offsets[i].end);
|
|
307
|
+
}
|
|
308
|
+
zsv_free_chunks(offsets);
|
|
309
|
+
return 0;
|
|
310
|
+
}
|
|
311
|
+
#endif // ZSV_NO_PARALLEL
|
|
312
|
+
|
|
313
|
+
static void zsv_select_header_finish(struct zsv_select_data *data) {
|
|
314
|
+
if (zsv_select_set_output_columns(data)) {
|
|
315
|
+
data->cancelled = 1;
|
|
316
|
+
return;
|
|
317
|
+
}
|
|
318
|
+
#ifndef ZSV_NO_PARALLEL
|
|
319
|
+
// set up parallelization; on error, fall back to serial
|
|
320
|
+
// TO DO: option to exit on error (instead of fall back)
|
|
321
|
+
if (data->input_path && data->num_chunks > 1) {
|
|
322
|
+
size_t header_row_end = zsv_cum_scanned_length(data->parser);
|
|
323
|
+
zsv_setup_parallel_chunks(data, data->input_path, header_row_end);
|
|
324
|
+
}
|
|
325
|
+
if (data->opts->verbose)
|
|
326
|
+
fprintf(stderr, "Running %s\n", data->run_in_parallel ? "parallel" : "single-threaded");
|
|
327
|
+
|
|
328
|
+
if (data->run_in_parallel) {
|
|
329
|
+
struct zsv_parallel_data *pdata = data->parallel_data;
|
|
330
|
+
zsv_select_print_header_row(data);
|
|
331
|
+
|
|
332
|
+
// start worker threads
|
|
333
|
+
for (unsigned int i = 1; i < data->num_chunks; i++) {
|
|
334
|
+
struct zsv_chunk_data *cdata = &pdata->chunk_data[i];
|
|
335
|
+
cdata->id = i;
|
|
336
|
+
cdata->opts = data->opts;
|
|
337
|
+
|
|
338
|
+
int create_status = pthread_create(&pdata->threads[i - 1], NULL, zsv_select_process_chunk, cdata);
|
|
339
|
+
if (create_status != 0) {
|
|
340
|
+
data->cancelled = 1;
|
|
341
|
+
zsv_printerr(1, "Error creating worker thread for chunk %d: %s", i, strerror(create_status));
|
|
342
|
+
return;
|
|
343
|
+
}
|
|
344
|
+
}
|
|
345
|
+
|
|
346
|
+
// main thread processes chunk 1
|
|
347
|
+
zsv_set_row_handler(data->parser, zsv_select_data_row_parallel);
|
|
348
|
+
} else
|
|
349
|
+
#endif
|
|
350
|
+
{
|
|
351
|
+
// no parallelization
|
|
352
|
+
zsv_select_print_header_row(data);
|
|
353
|
+
zsv_set_row_handler(data->parser, zsv_select_data_row);
|
|
354
|
+
}
|
|
355
|
+
}
|
|
356
|
+
|
|
357
|
+
static void zsv_select_header_row(void *ctx) {
|
|
358
|
+
struct zsv_select_data *data = ctx;
|
|
359
|
+
|
|
360
|
+
if (data->cancelled)
|
|
361
|
+
return;
|
|
362
|
+
|
|
363
|
+
unsigned int cols = zsv_cell_count(data->parser);
|
|
364
|
+
unsigned int max_header_ix = 0;
|
|
365
|
+
for (unsigned int i = 0; i < cols; i++) {
|
|
366
|
+
struct zsv_cell cell = zsv_get_cell(data->parser, i);
|
|
367
|
+
if (UNLIKELY(data->any_clean != 0))
|
|
368
|
+
cell.str = zsv_select_cell_clean(data, cell.str, &cell.quoted, &cell.len);
|
|
369
|
+
if (i < data->opts->max_columns) {
|
|
370
|
+
data->header_names[i] = zsv_memdup(cell.str, cell.len);
|
|
371
|
+
max_header_ix = i + 1;
|
|
372
|
+
}
|
|
373
|
+
}
|
|
374
|
+
|
|
375
|
+
// in case we want to make this an option later
|
|
376
|
+
char trim_trailing_columns = 1;
|
|
377
|
+
if (!trim_trailing_columns)
|
|
378
|
+
max_header_ix = cols;
|
|
379
|
+
|
|
380
|
+
if (max_header_ix > data->header_name_count)
|
|
381
|
+
data->header_name_count = max_header_ix;
|
|
382
|
+
|
|
383
|
+
zsv_select_header_finish(data);
|
|
384
|
+
}
|
|
385
|
+
|
|
386
|
+
static void zsv_select_cleanup(struct zsv_select_data *data) {
|
|
387
|
+
if (data->opts->stream && data->opts->stream != stdin)
|
|
388
|
+
fclose(data->opts->stream);
|
|
389
|
+
|
|
390
|
+
zsv_writer_delete(data->csv_writer);
|
|
391
|
+
zsv_select_search_str_delete(data->search_strings);
|
|
392
|
+
#ifdef HAVE_PCRE2_8
|
|
393
|
+
zsv_select_regexs_delete(data->search_regexs);
|
|
394
|
+
#endif
|
|
395
|
+
|
|
396
|
+
if (data->distinct == ZSV_SELECT_DISTINCT_MERGE) {
|
|
397
|
+
for (unsigned int i = 0; i < data->output_cols_count; i++) {
|
|
398
|
+
for (struct zsv_select_uint_list *next, *ix = data->out2in[i].merge.indexes; ix; ix = next) {
|
|
399
|
+
next = ix->next;
|
|
400
|
+
free(ix);
|
|
401
|
+
}
|
|
402
|
+
}
|
|
403
|
+
}
|
|
404
|
+
free(data->out2in);
|
|
405
|
+
|
|
406
|
+
for (unsigned int i = 0; i < data->header_name_count; i++)
|
|
407
|
+
free(data->header_names[i]);
|
|
408
|
+
free(data->header_names);
|
|
409
|
+
|
|
410
|
+
free(data->fixed.offsets);
|
|
411
|
+
|
|
412
|
+
#ifndef ZSV_NO_PARALLEL
|
|
413
|
+
if (data->run_in_parallel)
|
|
414
|
+
zsv_parallel_data_delete(data->parallel_data);
|
|
415
|
+
#endif
|
|
416
|
+
}
|
|
417
|
+
|
|
418
|
+
// Consume the value that must follow the current option: advance arg_i and
// assign conv_func(argv[arg_i]) to tgt. If no argument remains, report the
// error via zsv_printerr, store its result in stat and jump to
// zsv_select_main_done. Expects arg_i, argc, argv and stat to be in scope.
// conv_func may be a function name (e.g. atoi) or a parenthesized cast.
#define ARG_require_val(tgt, conv_func)                                                                                \
  do {                                                                                                                 \
    arg_i++;                                                                                                           \
    if (!(arg_i < argc)) {                                                                                             \
      stat = zsv_printerr(1, "%s option requires parameter", argv[arg_i - 1]);                                         \
      goto zsv_select_main_done;                                                                                       \
    }                                                                                                                  \
    tgt = conv_func(argv[arg_i]);                                                                                      \
  } while (0)
|
|
426
|
+
|
|
427
|
+
#ifndef ZSV_NO_PARALLEL
|
|
428
|
+
static int zsv_merge_worker_outputs(struct zsv_select_data *data, FILE *dest_stream) {
|
|
429
|
+
if (!data->run_in_parallel || !data->parallel_data)
|
|
430
|
+
return 0;
|
|
431
|
+
|
|
432
|
+
fflush(dest_stream);
|
|
433
|
+
#ifdef __linux__
|
|
434
|
+
int out_fd = fileno(dest_stream);
|
|
435
|
+
#endif
|
|
436
|
+
int status = 0;
|
|
437
|
+
|
|
438
|
+
for (unsigned int i = 0; i < data->num_chunks - 1; i++) {
|
|
439
|
+
pthread_join(data->parallel_data->threads[i], NULL);
|
|
440
|
+
|
|
441
|
+
struct zsv_chunk_data *next_chunk = &data->parallel_data->chunk_data[i + 1];
|
|
442
|
+
off_t actual_next_row_start =
|
|
443
|
+
i == 0 ? data->next_row_start : data->parallel_data->chunk_data[i].actual_next_row_start;
|
|
444
|
+
off_t expected_next_row_start = next_chunk->start_offset;
|
|
445
|
+
if (actual_next_row_start > expected_next_row_start) {
|
|
446
|
+
if (data->opts->verbose) {
|
|
447
|
+
fprintf(stderr, "Chunk overlap detected (Prev End: %zu, Next Start: %zu). Reprocessing chunk %d.\n",
|
|
448
|
+
(size_t)actual_next_row_start, (size_t)expected_next_row_start, i + 1);
|
|
449
|
+
}
|
|
450
|
+
|
|
451
|
+
// clean up invalid results from the worker thread
|
|
452
|
+
zsv_chunk_data_clear_output(next_chunk);
|
|
453
|
+
|
|
454
|
+
// adjust the start offset to the actual next row start
|
|
455
|
+
next_chunk->start_offset = actual_next_row_start;
|
|
456
|
+
|
|
457
|
+
// reprocess synchronously on the main thread
|
|
458
|
+
zsv_select_process_chunk_internal(next_chunk);
|
|
459
|
+
|
|
460
|
+
if (next_chunk->status != zsv_status_ok) // reprocessing failed!
|
|
461
|
+
status = zsv_status_error;
|
|
462
|
+
}
|
|
463
|
+
}
|
|
464
|
+
|
|
465
|
+
// join all of the output files into a single output file
|
|
466
|
+
for (unsigned int i = 1; i < data->num_chunks && status == 0; i++) {
|
|
467
|
+
struct zsv_chunk_data *c = &data->parallel_data->chunk_data[i];
|
|
468
|
+
if (c->skip)
|
|
469
|
+
continue;
|
|
470
|
+
#ifdef __linux__
|
|
471
|
+
int in_fd = open(c->tmp_output_filename, O_RDONLY);
|
|
472
|
+
if (in_fd < 0) {
|
|
473
|
+
zsv_printerr(1, "Error opening chunk %s: %s", c->tmp_output_filename, strerror(errno));
|
|
474
|
+
status = zsv_status_error;
|
|
475
|
+
break;
|
|
476
|
+
}
|
|
477
|
+
|
|
478
|
+
struct stat st;
|
|
479
|
+
if (fstat(in_fd, &st) == 0) {
|
|
480
|
+
long copied = zsv_concatenate_copy(out_fd, in_fd, st.st_size);
|
|
481
|
+
if (copied != st.st_size) {
|
|
482
|
+
zsv_printerr(1, "Warning: Partial copy chunk %d (%lli/%lli)", i, copied, (long long)st.st_size);
|
|
483
|
+
status = zsv_status_error;
|
|
484
|
+
}
|
|
485
|
+
} else {
|
|
486
|
+
status = zsv_status_error;
|
|
487
|
+
}
|
|
488
|
+
close(in_fd);
|
|
489
|
+
#else
|
|
490
|
+
zsv_memfile_rewind(c->tmp_f);
|
|
491
|
+
if (zsv_copy_filelike_ptr(
|
|
492
|
+
c->tmp_f, (size_t(*)(void *restrict ptr, size_t size, size_t nitems, void *restrict stream))zsv_memfile_read,
|
|
493
|
+
dest_stream,
|
|
494
|
+
(size_t(*)(const void *restrict ptr, size_t size, size_t nitems, void *restrict stream))fwrite)) {
|
|
495
|
+
perror("zsv temp mem file");
|
|
496
|
+
status = zsv_status_error;
|
|
497
|
+
}
|
|
498
|
+
#endif
|
|
499
|
+
}
|
|
500
|
+
return status;
|
|
501
|
+
}
|
|
502
|
+
#endif
|
|
503
|
+
|
|
504
|
+
int ZSV_MAIN_FUNC(ZSV_COMMAND)(int argc, const char *argv[], struct zsv_opts *opts,
|
|
505
|
+
struct zsv_prop_handler *custom_prop_handler) {
|
|
506
|
+
if (argc > 1 && (!strcmp(argv[1], "-h") || !strcmp(argv[1], "--help"))) {
|
|
507
|
+
zsv_select_usage();
|
|
508
|
+
return zsv_status_ok;
|
|
509
|
+
}
|
|
510
|
+
|
|
511
|
+
struct zsv_select_data data = {0};
|
|
512
|
+
data.opts = opts;
|
|
513
|
+
struct zsv_csv_writer_options writer_opts = zsv_writer_get_default_opts();
|
|
514
|
+
int col_index_arg_i = 0;
|
|
515
|
+
unsigned char *preview_buff = NULL;
|
|
516
|
+
size_t preview_buff_len = 0;
|
|
517
|
+
enum zsv_status stat = zsv_status_ok;
|
|
518
|
+
|
|
519
|
+
for (int arg_i = 1; stat == zsv_status_ok && arg_i < argc; arg_i++) {
|
|
520
|
+
const char *arg = argv[arg_i];
|
|
521
|
+
if (!strcmp(arg, "--")) {
|
|
522
|
+
col_index_arg_i = arg_i + 1;
|
|
523
|
+
break;
|
|
524
|
+
}
|
|
525
|
+
|
|
526
|
+
if (!strcmp(arg, "-b") || !strcmp(arg, "--with-bom"))
|
|
527
|
+
writer_opts.with_bom = 1;
|
|
528
|
+
else if (!strcmp(arg, "--fixed-auto-max-lines"))
|
|
529
|
+
ARG_require_val(data.fixed.max_lines, atoi);
|
|
530
|
+
else if (!strcmp(arg, "--fixed-auto"))
|
|
531
|
+
data.fixed.autodetect = 1;
|
|
532
|
+
else if (!strcmp(arg, "--fixed")) {
|
|
533
|
+
if (++arg_i >= argc) {
|
|
534
|
+
stat = zsv_printerr(1, "--fixed requires val");
|
|
535
|
+
goto zsv_select_main_done;
|
|
536
|
+
}
|
|
537
|
+
data.fixed.count = 1;
|
|
538
|
+
for (const char *s = argv[arg_i]; *s; s++)
|
|
539
|
+
if (*s == ',')
|
|
540
|
+
data.fixed.count++;
|
|
541
|
+
free(data.fixed.offsets);
|
|
542
|
+
data.fixed.offsets = calloc(data.fixed.count, sizeof(*data.fixed.offsets));
|
|
543
|
+
if (!data.fixed.offsets) {
|
|
544
|
+
stat = zsv_printerr(1, "Out of memory!");
|
|
545
|
+
goto zsv_select_main_done;
|
|
546
|
+
}
|
|
547
|
+
size_t count = 0;
|
|
548
|
+
char *dup = strdup(argv[arg_i]), *tok;
|
|
549
|
+
for (tok = strtok(dup, ","); tok && count < data.fixed.count; tok = strtok(NULL, ",")) {
|
|
550
|
+
if (sscanf(tok, "%zu", &data.fixed.offsets[count++]) != 1)
|
|
551
|
+
stat = zsv_printerr(1, "Invalid offset: %s", tok);
|
|
552
|
+
}
|
|
553
|
+
free(dup);
|
|
554
|
+
} else if (!strcmp(arg, "--distinct"))
|
|
555
|
+
data.distinct = 1;
|
|
556
|
+
else if (!strcmp(arg, "--merge"))
|
|
557
|
+
data.distinct = ZSV_SELECT_DISTINCT_MERGE;
|
|
558
|
+
else if (!strcmp(arg, "-o") || !strcmp(arg, "--output")) {
|
|
559
|
+
if (writer_opts.stream && writer_opts.stream != stdout)
|
|
560
|
+
stat = zsv_printerr(1, "Output specified twice");
|
|
561
|
+
else {
|
|
562
|
+
ARG_require_val(arg, (const char *));
|
|
563
|
+
if (!(writer_opts.stream = fopen(arg, "wb")))
|
|
564
|
+
stat = zsv_printerr(1, "Unable to open %s", arg);
|
|
565
|
+
}
|
|
566
|
+
} else if (!strcmp(arg, "-N") || !strcmp(arg, "--line-number"))
|
|
567
|
+
data.prepend_line_number = 1;
|
|
568
|
+
else if (!strcmp(arg, "-n"))
|
|
569
|
+
data.use_header_indexes = 1;
|
|
570
|
+
else if (!strcmp(arg, "-s") || !strcmp(arg, "--search")) {
|
|
571
|
+
const char *v;
|
|
572
|
+
ARG_require_val(v, (const char *));
|
|
573
|
+
zsv_select_add_search(&data, v);
|
|
574
|
+
}
|
|
575
|
+
#ifdef HAVE_PCRE2_8
|
|
576
|
+
else if (!strcmp(arg, "--regex-search")) {
|
|
577
|
+
const char *v;
|
|
578
|
+
ARG_require_val(v, (const char *));
|
|
579
|
+
zsv_select_add_regex(&data, v);
|
|
580
|
+
}
|
|
581
|
+
#endif
|
|
582
|
+
else if (!strcmp(arg, "-v") || !strcmp(arg, "--verbose"))
|
|
583
|
+
data.verbose = 1;
|
|
584
|
+
else if (!strcmp(arg, "--unescape"))
|
|
585
|
+
data.unescape = 1;
|
|
586
|
+
else if (!strcmp(arg, "-w") || !strcmp(arg, "--whitespace-clean"))
|
|
587
|
+
data.clean_white = 1;
|
|
588
|
+
else if (!strcmp(arg, "--whitespace-clean-no-newline")) {
|
|
589
|
+
data.clean_white = 1;
|
|
590
|
+
data.whitespace_clean_flags = 1;
|
|
591
|
+
} else if (!strcmp(arg, "-W") || !strcmp(arg, "--no-trim"))
|
|
592
|
+
data.no_trim_whitespace = 1;
|
|
593
|
+
else if (!strcmp(arg, "--sample-every"))
|
|
594
|
+
ARG_require_val(data.sample_every_n, atoi);
|
|
595
|
+
else if (!strcmp(arg, "--sample-pct"))
|
|
596
|
+
ARG_require_val(data.sample_pct, atof);
|
|
597
|
+
else if (!strcmp(arg, "--prepend-header")) {
|
|
598
|
+
int err = 0;
|
|
599
|
+
data.prepend_header = zsv_next_arg(++arg_i, argc, argv, &err);
|
|
600
|
+
if (err)
|
|
601
|
+
stat = zsv_status_error;
|
|
602
|
+
} else if (!strcmp(arg, "--no-header"))
|
|
603
|
+
data.no_header = 1;
|
|
604
|
+
else if (!strcmp(arg, "-H") || !strcmp(arg, "--head")) {
|
|
605
|
+
int val;
|
|
606
|
+
ARG_require_val(val, atoi);
|
|
607
|
+
data.data_rows_limit = val + 1;
|
|
608
|
+
} else if (!strcmp(arg, "-D") || !strcmp(arg, "--skip-data"))
|
|
609
|
+
ARG_require_val(data.skip_data_rows, atoi);
|
|
610
|
+
#ifndef ZSV_NO_PARALLEL
|
|
611
|
+
else if (!strcmp(arg, "-j") || !strcmp(arg, "--jobs"))
|
|
612
|
+
ARG_require_val(data.num_chunks, atoi);
|
|
613
|
+
else if (!strcmp(arg, "--parallel")) {
|
|
614
|
+
data.num_chunks = zsv_get_number_of_cores();
|
|
615
|
+
if (data.num_chunks < 2) {
|
|
616
|
+
fprintf(stderr, "Warning: --parallel specified but only one core found; using -j 4 instead");
|
|
617
|
+
data.num_chunks = 4;
|
|
618
|
+
}
|
|
619
|
+
}
|
|
620
|
+
#endif
|
|
621
|
+
else if (!strcmp(arg, "-e")) {
|
|
622
|
+
const char *v;
|
|
623
|
+
ARG_require_val(v, (const char *));
|
|
624
|
+
data.embedded_lineend = *v;
|
|
625
|
+
} else if (!strcmp(arg, "-x")) {
|
|
626
|
+
const char *v;
|
|
627
|
+
ARG_require_val(v, (const char *));
|
|
628
|
+
zsv_select_add_exclusion(&data, v);
|
|
629
|
+
} else if (*arg == '-')
|
|
630
|
+
stat = zsv_printerr(1, "Unrecognized argument: %s", arg);
|
|
631
|
+
else if (data.input_path)
|
|
632
|
+
stat = zsv_printerr(1, "Input specified twice");
|
|
633
|
+
else
|
|
634
|
+
data.input_path = arg;
|
|
635
|
+
}
|
|
636
|
+
|
|
637
|
+
if (stat != zsv_status_ok)
|
|
638
|
+
goto zsv_select_main_done;
|
|
639
|
+
|
|
640
|
+
// configuration & setup
|
|
641
|
+
if (!writer_opts.stream)
|
|
642
|
+
writer_opts.stream = stdout;
|
|
643
|
+
if (data.sample_pct)
|
|
644
|
+
srand(time(0));
|
|
645
|
+
if (data.use_header_indexes && (stat = zsv_select_check_exclusions_are_indexes(&data)))
|
|
646
|
+
goto zsv_select_main_done;
|
|
647
|
+
|
|
648
|
+
#ifndef ZSV_NO_PARALLEL
|
|
649
|
+
if (data.num_chunks > 1) {
|
|
650
|
+
enum zsv_chunk_status chstat = zsv_chunkable(data.input_path, data.opts);
|
|
651
|
+
if (chstat != zsv_chunk_status_ok) {
|
|
652
|
+
stat = zsv_printerr(1, "%s", zsv_chunk_status_str(chstat));
|
|
653
|
+
goto zsv_select_main_done;
|
|
654
|
+
}
|
|
655
|
+
}
|
|
656
|
+
#endif
|
|
657
|
+
|
|
658
|
+
// input stream
|
|
659
|
+
if (data.input_path) {
|
|
660
|
+
if (!(data.opts->stream = fopen(data.input_path, "rb")))
|
|
661
|
+
stat = zsv_printerr(1, "Cannot open %s", data.input_path);
|
|
662
|
+
} else {
|
|
663
|
+
#ifdef NO_STDIN
|
|
664
|
+
stat = zsv_printerr(1, "Input file required");
|
|
665
|
+
goto zsv_select_main_done;
|
|
666
|
+
#else
|
|
667
|
+
data.opts->stream = stdin;
|
|
668
|
+
#endif
|
|
669
|
+
}
|
|
670
|
+
|
|
671
|
+
// auto-fixed column detection
|
|
672
|
+
if (data.fixed.autodetect) { // fixed-auto flag
|
|
673
|
+
if (data.fixed.count)
|
|
674
|
+
stat = zsv_printerr(1, "--fixed-auto cannot be used with --fixed");
|
|
675
|
+
else {
|
|
676
|
+
size_t bsz = 1024 * 256;
|
|
677
|
+
if (!(preview_buff = calloc(bsz, 1)))
|
|
678
|
+
stat = zsv_status_memory;
|
|
679
|
+
else
|
|
680
|
+
stat =
|
|
681
|
+
auto_detect_fixed_column_sizes(&data.fixed, data.opts, preview_buff, bsz, &preview_buff_len, opts->verbose);
|
|
682
|
+
}
|
|
683
|
+
}
|
|
684
|
+
if (stat != zsv_status_ok)
|
|
685
|
+
goto zsv_select_main_done;
|
|
686
|
+
|
|
687
|
+
// parser initialization
|
|
688
|
+
if (col_index_arg_i) {
|
|
689
|
+
data.col_argv = &argv[col_index_arg_i];
|
|
690
|
+
data.col_argc = argc - col_index_arg_i;
|
|
691
|
+
}
|
|
692
|
+
|
|
693
|
+
data.header_names = calloc(data.opts->max_columns, sizeof(*data.header_names));
|
|
694
|
+
data.out2in = calloc(data.opts->max_columns, sizeof(*data.out2in));
|
|
695
|
+
data.csv_writer = zsv_writer_new(&writer_opts);
|
|
696
|
+
|
|
697
|
+
if (!data.header_names || !data.out2in || !data.csv_writer) {
|
|
698
|
+
stat = zsv_status_memory;
|
|
699
|
+
goto zsv_select_main_done;
|
|
700
|
+
}
|
|
701
|
+
|
|
702
|
+
// execution
|
|
703
|
+
data.opts->row_handler = zsv_select_header_row;
|
|
704
|
+
data.opts->ctx = &data;
|
|
705
|
+
|
|
706
|
+
if (zsv_new_with_properties(data.opts, custom_prop_handler, data.input_path, &data.parser) == zsv_status_ok) {
|
|
707
|
+
data.any_clean = !data.no_trim_whitespace || data.clean_white || data.embedded_lineend || data.unescape;
|
|
708
|
+
|
|
709
|
+
// apply fixed offsets (whether from --fixed arg or --fixed-auto detection)
|
|
710
|
+
if (data.fixed.count && zsv_set_fixed_offsets(data.parser, data.fixed.count, data.fixed.offsets) != zsv_status_ok)
|
|
711
|
+
data.cancelled = 1;
|
|
712
|
+
|
|
713
|
+
unsigned char writer_buff[512];
|
|
714
|
+
zsv_writer_set_temp_buff(data.csv_writer, writer_buff, sizeof(writer_buff));
|
|
715
|
+
|
|
716
|
+
zsv_handle_ctrl_c_signal();
|
|
717
|
+
|
|
718
|
+
enum zsv_status p_stat = zsv_status_ok;
|
|
719
|
+
if (preview_buff_len)
|
|
720
|
+
p_stat = zsv_parse_bytes(data.parser, preview_buff, preview_buff_len);
|
|
721
|
+
|
|
722
|
+
while (p_stat == zsv_status_ok && !zsv_signal_interrupted && !data.cancelled)
|
|
723
|
+
p_stat = zsv_parse_more(data.parser);
|
|
724
|
+
|
|
725
|
+
if (p_stat == zsv_status_no_more_input) {
|
|
726
|
+
zsv_finish(data.parser);
|
|
727
|
+
#ifndef ZSV_NO_PARALLEL
|
|
728
|
+
// unlikely, but maybe conceivable if chunk split was not accurate and
|
|
729
|
+
// a correctly-split chunk's last row entirely ate the next incorrectly-split chunk
|
|
730
|
+
if (data.run_in_parallel && !data.next_row_start)
|
|
731
|
+
data.next_row_start = zsv_cum_scanned_length(data.parser) + 1;
|
|
732
|
+
#endif
|
|
733
|
+
}
|
|
734
|
+
zsv_delete(data.parser);
|
|
735
|
+
|
|
736
|
+
#ifndef ZSV_NO_PARALLEL
|
|
737
|
+
if (data.run_in_parallel) {
|
|
738
|
+
// explicitly flush and delete main writer before merge which uses raw fd
|
|
739
|
+
zsv_writer_delete(data.csv_writer);
|
|
740
|
+
data.csv_writer = NULL;
|
|
741
|
+
if (zsv_merge_worker_outputs(&data, writer_opts.stream) != 0)
|
|
742
|
+
stat = zsv_status_error;
|
|
743
|
+
}
|
|
744
|
+
#endif
|
|
745
|
+
}
|
|
746
|
+
|
|
747
|
+
zsv_select_main_done:
|
|
748
|
+
free(preview_buff);
|
|
749
|
+
zsv_select_cleanup(&data);
|
|
750
|
+
if (writer_opts.stream && writer_opts.stream != stdout)
|
|
751
|
+
fclose(writer_opts.stream);
|
|
752
|
+
return stat;
|
|
753
|
+
}
|