rroonga 7.1.1-x86-mingw32 → 9.0.2-x86-mingw32
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/Rakefile +3 -3
- data/doc/text/news.md +22 -0
- data/ext/groonga/extconf.rb +29 -26
- data/ext/groonga/rb-grn.h +3 -3
- data/lib/2.2/groonga.so +0 -0
- data/lib/2.3/groonga.so +0 -0
- data/lib/2.4/groonga.so +0 -0
- data/lib/2.5/groonga.so +0 -0
- data/lib/groonga/expression-builder.rb +1 -1
- data/lib/groonga/schema.rb +13 -0
- data/rroonga-build.rb +4 -11
- data/test/test-expression-builder.rb +8 -0
- data/vendor/local/bin/cv2pdb.exe +0 -0
- data/vendor/local/bin/generate-pdb.bat +38 -36
- data/vendor/local/bin/grndb.exe +0 -0
- data/vendor/local/bin/groonga-benchmark.exe +0 -0
- data/vendor/local/bin/groonga-suggest-create-dataset.exe +0 -0
- data/vendor/local/bin/groonga.exe +0 -0
- data/vendor/local/bin/libgroonga-0.dll +0 -0
- data/vendor/local/bin/libmecab-2.dll +0 -0
- data/vendor/local/bin/libmsgpackc.dll +0 -0
- data/vendor/local/bin/libonigmo-6.dll +0 -0
- data/vendor/local/bin/libpcre-1.dll +0 -0
- data/vendor/local/bin/libpcrecpp-0.dll +0 -0
- data/vendor/local/bin/libpcreposix-0.dll +0 -0
- data/vendor/local/bin/lz4.exe +0 -0
- data/vendor/local/bin/lz4c.exe +0 -0
- data/vendor/local/bin/{lz4cat → lz4cat.exe} +0 -0
- data/vendor/local/bin/mecab.exe +0 -0
- data/vendor/local/bin/pcre-config +1 -1
- data/vendor/local/bin/pcregrep.exe +0 -0
- data/vendor/local/bin/pcretest.exe +0 -0
- data/vendor/local/bin/unlz4.exe +0 -0
- data/vendor/local/bin/zlib1.dll +0 -0
- data/vendor/local/include/groonga/groonga.h +16 -1
- data/vendor/local/include/groonga/groonga/accessor.h +5 -1
- data/vendor/local/include/groonga/groonga/column.h +4 -0
- data/vendor/local/include/groonga/groonga/db.h +3 -1
- data/vendor/local/include/groonga/groonga/expr.h +5 -0
- data/vendor/local/include/groonga/groonga/groonga.h +124 -171
- data/vendor/local/include/groonga/groonga/highlighter.h +57 -0
- data/vendor/local/include/groonga/groonga/ii.h +2 -0
- data/vendor/local/include/groonga/groonga/index_column.h +31 -0
- data/vendor/local/include/groonga/groonga/memory.h +29 -0
- data/vendor/local/include/groonga/groonga/msgpack.h +50 -0
- data/vendor/local/include/groonga/groonga/obj.h +22 -1
- data/vendor/local/include/groonga/groonga/option.h +61 -0
- data/vendor/local/include/groonga/groonga/output.h +57 -2
- data/vendor/local/include/groonga/groonga/output_columns.h +38 -0
- data/vendor/local/include/groonga/groonga/plugin.h +5 -0
- data/vendor/local/include/groonga/groonga/raw_string.h +60 -0
- data/vendor/local/include/groonga/groonga/string.h +113 -0
- data/vendor/local/include/groonga/groonga/table.h +89 -1
- data/vendor/local/include/groonga/groonga/thread.h +15 -0
- data/vendor/local/include/groonga/groonga/time.h +1 -0
- data/vendor/local/include/groonga/groonga/token.h +60 -10
- data/vendor/local/include/groonga/groonga/token_cursor.h +59 -0
- data/vendor/local/include/groonga/groonga/token_filter.h +24 -0
- data/vendor/local/include/groonga/groonga/token_metadata.h +49 -0
- data/vendor/local/include/groonga/groonga/tokenizer.h +99 -25
- data/vendor/local/include/groonga/groonga/tokenizer_query_deprecated.h +50 -0
- data/vendor/local/include/groonga/groonga/vector.h +80 -0
- data/vendor/local/include/groonga/groonga/version.h +32 -0
- data/vendor/local/include/groonga/groonga/window_function.h +18 -8
- data/vendor/local/include/groonga/groonga/window_function_executor.h +68 -0
- data/vendor/local/include/lz4.h +504 -212
- data/vendor/local/include/lz4frame.h +433 -153
- data/vendor/local/include/lz4frame_static.h +47 -0
- data/vendor/local/include/lz4hc.h +281 -108
- data/vendor/local/include/msgpack.hpp +4 -0
- data/vendor/local/include/msgpack/adaptor/adaptor_base.hpp +1 -0
- data/vendor/local/include/msgpack/adaptor/adaptor_base_decl.hpp +1 -0
- data/vendor/local/include/msgpack/adaptor/array_ref_decl.hpp +1 -0
- data/vendor/local/include/msgpack/adaptor/boost/msgpack_variant_decl.hpp +1 -0
- data/vendor/local/include/msgpack/adaptor/boost/string_view.hpp +15 -0
- data/vendor/local/include/msgpack/adaptor/check_container_size_decl.hpp +1 -0
- data/vendor/local/include/msgpack/adaptor/cpp17/optional.hpp +16 -0
- data/vendor/local/include/msgpack/adaptor/cpp17/string_view.hpp +16 -0
- data/vendor/local/include/msgpack/adaptor/define_decl.hpp +2 -0
- data/vendor/local/include/msgpack/adaptor/ext_decl.hpp +1 -0
- data/vendor/local/include/msgpack/adaptor/fixint_decl.hpp +1 -0
- data/vendor/local/include/msgpack/adaptor/int_decl.hpp +1 -0
- data/vendor/local/include/msgpack/adaptor/map_decl.hpp +1 -0
- data/vendor/local/include/msgpack/adaptor/msgpack_tuple_decl.hpp +1 -0
- data/vendor/local/include/msgpack/adaptor/nil_decl.hpp +1 -0
- data/vendor/local/include/msgpack/adaptor/raw_decl.hpp +1 -0
- data/vendor/local/include/msgpack/adaptor/size_equal_only_decl.hpp +1 -0
- data/vendor/local/include/msgpack/adaptor/tr1/unordered_map.hpp +2 -2
- data/vendor/local/include/msgpack/adaptor/tr1/unordered_set.hpp +2 -2
- data/vendor/local/include/msgpack/adaptor/v4raw_decl.hpp +1 -0
- data/vendor/local/include/msgpack/cpp_config_decl.hpp +1 -0
- data/vendor/local/include/msgpack/create_object_visitor.hpp +17 -0
- data/vendor/local/include/msgpack/create_object_visitor_decl.hpp +16 -0
- data/vendor/local/include/msgpack/fbuffer.h +1 -1
- data/vendor/local/include/msgpack/fbuffer_decl.hpp +1 -0
- data/vendor/local/include/msgpack/gcc_atomic.hpp +0 -2
- data/vendor/local/include/msgpack/iterator_decl.hpp +2 -1
- data/vendor/local/include/msgpack/meta_decl.hpp +1 -0
- data/vendor/local/include/msgpack/null_visitor.hpp +17 -0
- data/vendor/local/include/msgpack/null_visitor_decl.hpp +16 -0
- data/vendor/local/include/msgpack/object.h +5 -0
- data/vendor/local/include/msgpack/object_decl.hpp +1 -0
- data/vendor/local/include/msgpack/object_fwd.hpp +1 -0
- data/vendor/local/include/msgpack/object_fwd_decl.hpp +1 -0
- data/vendor/local/include/msgpack/pack.h +1 -0
- data/vendor/local/include/msgpack/pack_decl.hpp +1 -0
- data/vendor/local/include/msgpack/parse.hpp +18 -0
- data/vendor/local/include/msgpack/parse_decl.hpp +16 -0
- data/vendor/local/include/msgpack/parse_return.hpp +17 -0
- data/vendor/local/include/msgpack/sbuffer_decl.hpp +1 -0
- data/vendor/local/include/msgpack/sysdep.h +34 -26
- data/vendor/local/include/msgpack/type.hpp +9 -0
- data/vendor/local/include/msgpack/unpack.h +12 -1
- data/vendor/local/include/msgpack/unpack.hpp +1 -0
- data/vendor/local/include/msgpack/unpack_decl.hpp +1 -0
- data/vendor/local/include/msgpack/unpack_exception.hpp +15 -0
- data/vendor/local/include/msgpack/unpack_template.h +22 -30
- data/vendor/local/include/msgpack/v1/adaptor/array_ref.hpp +6 -6
- data/vendor/local/include/msgpack/v1/adaptor/boost/fusion.hpp +49 -6
- data/vendor/local/include/msgpack/v1/adaptor/boost/msgpack_variant.hpp +6 -4
- data/vendor/local/include/msgpack/v1/adaptor/boost/string_view.hpp +87 -0
- data/vendor/local/include/msgpack/v1/adaptor/carray.hpp +11 -11
- data/vendor/local/include/msgpack/v1/adaptor/char_ptr.hpp +1 -1
- data/vendor/local/include/msgpack/v1/adaptor/cpp11/array.hpp +1 -1
- data/vendor/local/include/msgpack/v1/adaptor/cpp11/array_char.hpp +8 -1
- data/vendor/local/include/msgpack/v1/adaptor/cpp11/array_unsigned_char.hpp +8 -1
- data/vendor/local/include/msgpack/v1/adaptor/cpp11/forward_list.hpp +1 -1
- data/vendor/local/include/msgpack/v1/adaptor/cpp11/tuple.hpp +2 -2
- data/vendor/local/include/msgpack/v1/adaptor/cpp11/unordered_map.hpp +4 -4
- data/vendor/local/include/msgpack/v1/adaptor/cpp11/unordered_set.hpp +2 -2
- data/vendor/local/include/msgpack/v1/adaptor/cpp17/optional.hpp +90 -0
- data/vendor/local/include/msgpack/v1/adaptor/cpp17/string_view.hpp +86 -0
- data/vendor/local/include/msgpack/v1/adaptor/deque.hpp +1 -1
- data/vendor/local/include/msgpack/v1/adaptor/detail/cpp03_define_array.hpp +1088 -32
- data/vendor/local/include/msgpack/v1/adaptor/detail/cpp03_define_map.hpp +32 -16
- data/vendor/local/include/msgpack/v1/adaptor/detail/cpp03_msgpack_tuple.hpp +32 -32
- data/vendor/local/include/msgpack/v1/adaptor/detail/cpp11_convert_helper.hpp +45 -0
- data/vendor/local/include/msgpack/v1/adaptor/detail/cpp11_define_array.hpp +4 -3
- data/vendor/local/include/msgpack/v1/adaptor/detail/cpp11_define_map.hpp +4 -2
- data/vendor/local/include/msgpack/v1/adaptor/detail/cpp11_msgpack_tuple.hpp +2 -2
- data/vendor/local/include/msgpack/v1/adaptor/ext.hpp +1 -1
- data/vendor/local/include/msgpack/v1/adaptor/fixint.hpp +40 -24
- data/vendor/local/include/msgpack/v1/adaptor/float.hpp +4 -4
- data/vendor/local/include/msgpack/v1/adaptor/int.hpp +55 -33
- data/vendor/local/include/msgpack/v1/adaptor/list.hpp +1 -1
- data/vendor/local/include/msgpack/v1/adaptor/map.hpp +10 -10
- data/vendor/local/include/msgpack/v1/adaptor/pair.hpp +2 -2
- data/vendor/local/include/msgpack/v1/adaptor/set.hpp +2 -2
- data/vendor/local/include/msgpack/v1/adaptor/string.hpp +1 -1
- data/vendor/local/include/msgpack/v1/adaptor/tr1/unordered_map.hpp +2 -2
- data/vendor/local/include/msgpack/v1/adaptor/tr1/unordered_set.hpp +2 -2
- data/vendor/local/include/msgpack/v1/adaptor/vector.hpp +5 -5
- data/vendor/local/include/msgpack/v1/adaptor/vector_bool.hpp +1 -1
- data/vendor/local/include/msgpack/v1/adaptor/vector_char.hpp +9 -9
- data/vendor/local/include/msgpack/v1/adaptor/vector_unsigned_char.hpp +9 -9
- data/vendor/local/include/msgpack/v1/cpp_config.hpp +6 -0
- data/vendor/local/include/msgpack/v1/cpp_config_decl.hpp +6 -0
- data/vendor/local/include/msgpack/v1/detail/cpp03_zone.hpp +41 -34
- data/vendor/local/include/msgpack/v1/detail/cpp03_zone_decl.hpp +8 -0
- data/vendor/local/include/msgpack/v1/detail/cpp11_zone.hpp +25 -19
- data/vendor/local/include/msgpack/v1/detail/cpp11_zone_decl.hpp +8 -0
- data/vendor/local/include/msgpack/v1/meta.hpp +6 -0
- data/vendor/local/include/msgpack/v1/meta_decl.hpp +5 -0
- data/vendor/local/include/msgpack/v1/object.hpp +768 -393
- data/vendor/local/include/msgpack/v1/object_decl.hpp +11 -1
- data/vendor/local/include/msgpack/v1/object_fwd.hpp +4 -1
- data/vendor/local/include/msgpack/v1/object_fwd_decl.hpp +3 -1
- data/vendor/local/include/msgpack/v1/parse_return.hpp +36 -0
- data/vendor/local/include/msgpack/v1/unpack.hpp +39 -120
- data/vendor/local/include/msgpack/v1/unpack_decl.hpp +2 -9
- data/vendor/local/include/msgpack/v1/unpack_exception.hpp +122 -0
- data/vendor/local/include/msgpack/v1/vrefbuffer.hpp +2 -2
- data/vendor/local/include/msgpack/v2/create_object_visitor.hpp +250 -0
- data/vendor/local/include/msgpack/v2/create_object_visitor_decl.hpp +33 -0
- data/vendor/local/include/msgpack/v2/meta_decl.hpp +4 -0
- data/vendor/local/include/msgpack/v2/null_visitor.hpp +96 -0
- data/vendor/local/include/msgpack/v2/null_visitor_decl.hpp +29 -0
- data/vendor/local/include/msgpack/v2/object_decl.hpp +4 -0
- data/vendor/local/include/msgpack/v2/object_fwd.hpp +1 -1
- data/vendor/local/include/msgpack/v2/object_fwd_decl.hpp +2 -0
- data/vendor/local/include/msgpack/v2/pack_decl.hpp +1 -0
- data/vendor/local/include/msgpack/v2/parse.hpp +1072 -0
- data/vendor/local/include/msgpack/v2/parse_decl.hpp +79 -0
- data/vendor/local/include/msgpack/v2/parse_return.hpp +37 -0
- data/vendor/local/include/msgpack/v2/unpack.hpp +21 -1298
- data/vendor/local/include/msgpack/v2/unpack_decl.hpp +9 -45
- data/vendor/local/include/msgpack/v2/x3_parse.hpp +875 -0
- data/vendor/local/include/msgpack/v2/x3_parse_decl.hpp +36 -0
- data/vendor/local/include/msgpack/v2/x3_unpack.hpp +120 -0
- data/vendor/local/include/msgpack/v2/x3_unpack_decl.hpp +71 -0
- data/vendor/local/include/msgpack/v3/adaptor/adaptor_base.hpp +58 -0
- data/vendor/local/include/msgpack/v3/adaptor/adaptor_base_decl.hpp +52 -0
- data/vendor/local/include/msgpack/v3/adaptor/array_ref_decl.hpp +36 -0
- data/vendor/local/include/msgpack/v3/adaptor/boost/msgpack_variant_decl.hpp +42 -0
- data/vendor/local/include/msgpack/v3/adaptor/check_container_size_decl.hpp +39 -0
- data/vendor/local/include/msgpack/v3/adaptor/define_decl.hpp +23 -0
- data/vendor/local/include/msgpack/v3/adaptor/detail/cpp03_define_array_decl.hpp +31 -0
- data/vendor/local/include/msgpack/v3/adaptor/detail/cpp03_define_map_decl.hpp +31 -0
- data/vendor/local/include/msgpack/v3/adaptor/detail/cpp03_msgpack_tuple_decl.hpp +43 -0
- data/vendor/local/include/msgpack/v3/adaptor/detail/cpp11_define_array_decl.hpp +32 -0
- data/vendor/local/include/msgpack/v3/adaptor/detail/cpp11_define_map_decl.hpp +31 -0
- data/vendor/local/include/msgpack/v3/adaptor/detail/cpp11_msgpack_tuple_decl.hpp +59 -0
- data/vendor/local/include/msgpack/v3/adaptor/ext_decl.hpp +34 -0
- data/vendor/local/include/msgpack/v3/adaptor/fixint_decl.hpp +43 -0
- data/vendor/local/include/msgpack/v3/adaptor/int_decl.hpp +54 -0
- data/vendor/local/include/msgpack/v3/adaptor/map_decl.hpp +33 -0
- data/vendor/local/include/msgpack/v3/adaptor/msgpack_tuple_decl.hpp +21 -0
- data/vendor/local/include/msgpack/v3/adaptor/nil_decl.hpp +42 -0
- data/vendor/local/include/msgpack/v3/adaptor/raw_decl.hpp +33 -0
- data/vendor/local/include/msgpack/v3/adaptor/size_equal_only_decl.hpp +35 -0
- data/vendor/local/include/msgpack/v3/adaptor/v4raw_decl.hpp +34 -0
- data/vendor/local/include/msgpack/v3/cpp_config_decl.hpp +84 -0
- data/vendor/local/include/msgpack/v3/create_object_visitor_decl.hpp +33 -0
- data/vendor/local/include/msgpack/v3/detail/cpp03_zone_decl.hpp +31 -0
- data/vendor/local/include/msgpack/v3/detail/cpp11_zone_decl.hpp +31 -0
- data/vendor/local/include/msgpack/v3/fbuffer_decl.hpp +32 -0
- data/vendor/local/include/msgpack/v3/iterator_decl.hpp +33 -0
- data/vendor/local/include/msgpack/v3/meta_decl.hpp +50 -0
- data/vendor/local/include/msgpack/v3/null_visitor_decl.hpp +29 -0
- data/vendor/local/include/msgpack/v3/object_decl.hpp +53 -0
- data/vendor/local/include/msgpack/v3/object_fwd.hpp +70 -0
- data/vendor/local/include/msgpack/v3/object_fwd_decl.hpp +75 -0
- data/vendor/local/include/msgpack/v3/pack_decl.hpp +55 -0
- data/vendor/local/include/msgpack/v3/parse.hpp +677 -0
- data/vendor/local/include/msgpack/v3/parse_decl.hpp +49 -0
- data/vendor/local/include/msgpack/v3/parse_return.hpp +35 -0
- data/vendor/local/include/msgpack/v3/sbuffer_decl.hpp +33 -0
- data/vendor/local/include/msgpack/v3/unpack.hpp +192 -0
- data/vendor/local/include/msgpack/v3/unpack_decl.hpp +304 -0
- data/vendor/local/include/msgpack/v3/vrefbuffer_decl.hpp +29 -0
- data/vendor/local/include/msgpack/v3/x3_parse_decl.hpp +34 -0
- data/vendor/local/include/msgpack/v3/x3_unpack.hpp +97 -0
- data/vendor/local/include/msgpack/v3/x3_unpack_decl.hpp +65 -0
- data/vendor/local/include/msgpack/v3/zbuffer_decl.hpp +29 -0
- data/vendor/local/include/msgpack/v3/zone_decl.hpp +21 -0
- data/vendor/local/include/msgpack/version_master.h +2 -2
- data/vendor/local/include/msgpack/versioning.hpp +5 -3
- data/vendor/local/include/msgpack/vrefbuffer.h +1 -2
- data/vendor/local/include/msgpack/vrefbuffer_decl.hpp +1 -0
- data/vendor/local/include/msgpack/x3_parse.hpp +15 -0
- data/vendor/local/include/msgpack/x3_parse_decl.hpp +16 -0
- data/vendor/local/include/msgpack/x3_unpack.hpp +16 -0
- data/vendor/local/include/msgpack/x3_unpack_decl.hpp +16 -0
- data/vendor/local/include/msgpack/zbuffer_decl.hpp +1 -0
- data/vendor/local/include/msgpack/zone_decl.hpp +1 -0
- data/vendor/local/include/pcre.h +6 -6
- data/vendor/local/lib/cmake/msgpack/msgpack-config-version.cmake +46 -0
- data/vendor/local/lib/cmake/msgpack/msgpack-config.cmake +47 -0
- data/vendor/local/lib/cmake/msgpack/msgpack-targets-noconfig.cmake +29 -0
- data/vendor/local/lib/cmake/msgpack/msgpack-targets.cmake +101 -0
- data/vendor/local/lib/groonga/plugins/functions/index_column.a +0 -0
- data/vendor/local/lib/groonga/plugins/functions/index_column.dll +0 -0
- data/vendor/local/lib/groonga/plugins/functions/index_column.dll.a +0 -0
- data/vendor/local/lib/groonga/plugins/functions/index_column.la +1 -1
- data/vendor/local/lib/groonga/plugins/functions/math.a +0 -0
- data/vendor/local/lib/groonga/plugins/functions/math.dll +0 -0
- data/vendor/local/lib/groonga/plugins/functions/math.dll.a +0 -0
- data/vendor/local/lib/groonga/plugins/functions/math.la +1 -1
- data/vendor/local/lib/groonga/plugins/functions/number.a +0 -0
- data/vendor/local/lib/groonga/plugins/functions/number.dll +0 -0
- data/vendor/local/lib/groonga/plugins/functions/number.dll.a +0 -0
- data/vendor/local/lib/groonga/plugins/functions/number.la +1 -1
- data/vendor/local/lib/groonga/plugins/functions/string.a +0 -0
- data/vendor/local/lib/groonga/plugins/functions/string.dll +0 -0
- data/vendor/local/lib/groonga/plugins/functions/string.dll.a +0 -0
- data/vendor/local/lib/groonga/plugins/functions/string.la +1 -1
- data/vendor/local/lib/groonga/plugins/functions/time.a +0 -0
- data/vendor/local/lib/groonga/plugins/functions/time.dll +0 -0
- data/vendor/local/lib/groonga/plugins/functions/time.dll.a +0 -0
- data/vendor/local/lib/groonga/plugins/functions/time.la +1 -1
- data/vendor/local/lib/groonga/plugins/functions/vector.a +0 -0
- data/vendor/local/lib/groonga/plugins/functions/vector.dll +0 -0
- data/vendor/local/lib/groonga/plugins/functions/vector.dll.a +0 -0
- data/vendor/local/lib/groonga/plugins/functions/vector.la +1 -1
- data/vendor/local/lib/groonga/plugins/normalizers/mysql.a +0 -0
- data/vendor/local/lib/groonga/plugins/normalizers/mysql.dll +0 -0
- data/vendor/local/lib/groonga/plugins/normalizers/mysql.dll.a +0 -0
- data/vendor/local/lib/groonga/plugins/normalizers/mysql.la +2 -2
- data/vendor/local/lib/groonga/plugins/query_expanders/tsv.a +0 -0
- data/vendor/local/lib/groonga/plugins/query_expanders/tsv.dll +0 -0
- data/vendor/local/lib/groonga/plugins/query_expanders/tsv.dll.a +0 -0
- data/vendor/local/lib/groonga/plugins/query_expanders/tsv.la +1 -1
- data/vendor/local/lib/groonga/plugins/sharding/dynamic_columns.rb +150 -19
- data/vendor/local/lib/groonga/plugins/sharding/logical_count.rb +123 -65
- data/vendor/local/lib/groonga/plugins/sharding/logical_range_filter.rb +528 -113
- data/vendor/local/lib/groonga/plugins/sharding/logical_select.rb +142 -40
- data/vendor/local/lib/groonga/plugins/suggest/suggest.a +0 -0
- data/vendor/local/lib/groonga/plugins/suggest/suggest.dll +0 -0
- data/vendor/local/lib/groonga/plugins/suggest/suggest.dll.a +0 -0
- data/vendor/local/lib/groonga/plugins/suggest/suggest.la +1 -1
- data/vendor/local/lib/groonga/plugins/token_filters/stop_word.a +0 -0
- data/vendor/local/lib/groonga/plugins/token_filters/stop_word.dll +0 -0
- data/vendor/local/lib/groonga/plugins/token_filters/stop_word.dll.a +0 -0
- data/vendor/local/lib/groonga/plugins/token_filters/stop_word.la +1 -1
- data/vendor/local/lib/groonga/plugins/tokenizers/mecab.a +0 -0
- data/vendor/local/lib/groonga/plugins/tokenizers/mecab.dll +0 -0
- data/vendor/local/lib/groonga/plugins/tokenizers/mecab.dll.a +0 -0
- data/vendor/local/lib/groonga/plugins/tokenizers/mecab.la +1 -1
- data/vendor/local/lib/groonga/scripts/ruby/backtrace_entry.rb +1 -1
- data/vendor/local/lib/groonga/scripts/ruby/command_line/grndb.rb +64 -35
- data/vendor/local/lib/groonga/scripts/ruby/expression.rb +3 -1
- data/vendor/local/lib/groonga/scripts/ruby/expression_rewriters.rb +15 -21
- data/vendor/local/lib/groonga/scripts/ruby/expression_rewriters/optimizer.rb +274 -0
- data/vendor/local/lib/groonga/scripts/ruby/expression_tree.rb +8 -2
- data/vendor/local/lib/groonga/scripts/ruby/expression_tree/assign.rb +22 -0
- data/vendor/local/lib/groonga/scripts/ruby/expression_tree/assign_binary_operation.rb +24 -0
- data/vendor/local/lib/groonga/scripts/ruby/expression_tree/binary_operation.rb +206 -8
- data/vendor/local/lib/groonga/scripts/ruby/expression_tree/constant.rb +16 -1
- data/vendor/local/lib/groonga/scripts/ruby/expression_tree/function_call.rb +30 -1
- data/vendor/local/lib/groonga/scripts/ruby/expression_tree/logical_operation.rb +6 -0
- data/vendor/local/lib/groonga/scripts/ruby/expression_tree/member.rb +18 -0
- data/vendor/local/lib/groonga/scripts/ruby/expression_tree/null.rb +17 -0
- data/vendor/local/lib/groonga/scripts/ruby/expression_tree/reference.rb +18 -0
- data/vendor/local/lib/groonga/scripts/ruby/expression_tree/table.rb +14 -0
- data/vendor/local/lib/groonga/scripts/ruby/expression_tree/unary_operation.rb +26 -0
- data/vendor/local/lib/groonga/scripts/ruby/expression_tree/variable.rb +4 -0
- data/vendor/local/lib/groonga/scripts/ruby/expression_tree_builder.rb +78 -8
- data/vendor/local/lib/groonga/scripts/ruby/index_column.rb +10 -0
- data/vendor/local/lib/groonga/scripts/ruby/initialize/post.rb +2 -0
- data/vendor/local/lib/groonga/scripts/ruby/locale_output.rb +28 -0
- data/vendor/local/lib/groonga/scripts/ruby/logger.rb +36 -4
- data/vendor/local/lib/groonga/scripts/ruby/record.rb +1 -1
- data/vendor/local/lib/groonga/scripts/ruby/scan_info_builder.rb +0 -3
- data/vendor/local/lib/groonga/scripts/ruby/scan_info_data.rb +46 -5
- data/vendor/local/lib/groonga/scripts/ruby/scan_info_data_size_estimator.rb +5 -136
- data/vendor/local/lib/groonga/scripts/ruby/table.rb +2 -2
- data/vendor/local/lib/libgroonga.a +0 -0
- data/vendor/local/lib/libgroonga.dll.a +0 -0
- data/vendor/local/lib/libgroonga.la +1 -1
- data/vendor/local/lib/liblz4.a +0 -0
- data/vendor/local/lib/liblz4.dll +0 -0
- data/vendor/local/lib/liblz4.dll.1 +0 -0
- data/vendor/local/lib/{liblz4.dll.1.5.0 → liblz4.dll.1.8.2} +0 -0
- data/vendor/local/lib/libmecab.dll.a +0 -0
- data/vendor/local/lib/libmsgpackc.a +0 -0
- data/vendor/local/lib/libmsgpackc.dll.a +0 -0
- data/vendor/local/lib/libonigmo.a +0 -0
- data/vendor/local/lib/libonigmo.dll.a +0 -0
- data/vendor/local/lib/libpcre.a +0 -0
- data/vendor/local/lib/libpcre.dll.a +0 -0
- data/vendor/local/lib/libpcre.la +2 -2
- data/vendor/local/lib/libpcrecpp.dll.a +0 -0
- data/vendor/local/lib/libpcrecpp.la +1 -1
- data/vendor/local/lib/libpcreposix.a +0 -0
- data/vendor/local/lib/libpcreposix.dll.a +0 -0
- data/vendor/local/lib/libpcreposix.la +2 -2
- data/vendor/local/lib/libz.dll.a +0 -0
- data/vendor/local/lib/pkgconfig/groonga-normalizer-mysql.pc +1 -1
- data/vendor/local/lib/pkgconfig/groonga.pc +2 -2
- data/vendor/local/lib/pkgconfig/liblz4.pc +3 -3
- data/vendor/local/lib/pkgconfig/libpcre.pc +1 -1
- data/vendor/local/lib/pkgconfig/libpcrecpp.pc +1 -1
- data/vendor/local/lib/pkgconfig/libpcreposix.pc +1 -1
- data/vendor/local/lib/pkgconfig/msgpack.pc +1 -1
- data/vendor/local/libexec/mecab/mecab-cost-train.exe +0 -0
- data/vendor/local/libexec/mecab/mecab-dict-gen.exe +0 -0
- data/vendor/local/libexec/mecab/mecab-dict-index.exe +0 -0
- data/vendor/local/libexec/mecab/mecab-system-eval.exe +0 -0
- data/vendor/local/libexec/mecab/mecab-test-gen.exe +0 -0
- data/vendor/local/share/doc/groonga-normalizer-mysql/README.md +14 -22
- data/vendor/local/share/doc/groonga-normalizer-mysql/news.md +22 -2
- data/vendor/local/share/doc/groonga/en/html/.buildinfo +1 -1
- data/vendor/local/share/doc/groonga/en/html/_static/basic.css +113 -4
- data/vendor/local/share/doc/groonga/en/html/_static/doctools.js +46 -19
- data/vendor/local/share/doc/groonga/en/html/_static/documentation_options.js +10 -0
- data/vendor/local/share/doc/groonga/en/html/_static/{jquery-3.1.0.js → jquery-3.2.1.js} +474 -295
- data/vendor/local/share/doc/groonga/en/html/_static/jquery.js +4 -4
- data/vendor/local/share/doc/groonga/en/html/_static/language_data.js +297 -0
- data/vendor/local/share/doc/groonga/en/html/_static/pygments.css +4 -0
- data/vendor/local/share/doc/groonga/en/html/_static/searchtools.js +69 -322
- data/vendor/local/share/doc/groonga/en/html/characteristic.html +16 -24
- data/vendor/local/share/doc/groonga/en/html/client.html +15 -23
- data/vendor/local/share/doc/groonga/en/html/community.html +30 -38
- data/vendor/local/share/doc/groonga/en/html/contribution.html +23 -31
- data/vendor/local/share/doc/groonga/en/html/contribution/development.html +15 -23
- data/vendor/local/share/doc/groonga/en/html/contribution/development/build.html +15 -23
- data/vendor/local/share/doc/groonga/en/html/contribution/development/build/unix_autotools.html +58 -66
- data/vendor/local/share/doc/groonga/en/html/contribution/development/build/unix_cmake.html +51 -56
- data/vendor/local/share/doc/groonga/en/html/contribution/development/build/windows_cmake.html +52 -56
- data/vendor/local/share/doc/groonga/en/html/contribution/development/com.html +27 -35
- data/vendor/local/share/doc/groonga/en/html/contribution/development/cooperation.html +19 -27
- data/vendor/local/share/doc/groonga/en/html/contribution/development/query.html +26 -34
- data/vendor/local/share/doc/groonga/en/html/contribution/development/release.html +167 -167
- data/vendor/local/share/doc/groonga/en/html/contribution/development/repository.html +16 -24
- data/vendor/local/share/doc/groonga/en/html/contribution/development/test.html +28 -36
- data/vendor/local/share/doc/groonga/en/html/contribution/documentation.html +15 -23
- data/vendor/local/share/doc/groonga/en/html/contribution/documentation/c-api.html +15 -23
- data/vendor/local/share/doc/groonga/en/html/contribution/documentation/i18n.html +59 -67
- data/vendor/local/share/doc/groonga/en/html/contribution/documentation/introduction.html +31 -39
- data/vendor/local/share/doc/groonga/en/html/contribution/report.html +18 -26
- data/vendor/local/share/doc/groonga/en/html/development.html +15 -23
- data/vendor/local/share/doc/groonga/en/html/development/travis-ci.html +38 -43
- data/vendor/local/share/doc/groonga/en/html/genindex.html +50 -28
- data/vendor/local/share/doc/groonga/en/html/index.html +248 -234
- data/vendor/local/share/doc/groonga/en/html/install.html +43 -47
- data/vendor/local/share/doc/groonga/en/html/install/centos.html +43 -51
- data/vendor/local/share/doc/groonga/en/html/install/debian.html +52 -131
- data/vendor/local/share/doc/groonga/en/html/install/docker.html +155 -0
- data/vendor/local/share/doc/groonga/en/html/install/fedora.html +41 -49
- data/vendor/local/share/doc/groonga/en/html/install/mac_os_x.html +29 -37
- data/vendor/local/share/doc/groonga/en/html/install/others.html +142 -150
- data/vendor/local/share/doc/groonga/en/html/install/solaris.html +30 -38
- data/vendor/local/share/doc/groonga/en/html/install/ubuntu.html +43 -51
- data/vendor/local/share/doc/groonga/en/html/install/windows.html +33 -41
- data/vendor/local/share/doc/groonga/en/html/limitations.html +36 -42
- data/vendor/local/share/doc/groonga/en/html/news.html +1586 -598
- data/vendor/local/share/doc/groonga/en/html/news/0.x.html +83 -83
- data/vendor/local/share/doc/groonga/en/html/news/1.0.x.html +147 -155
- data/vendor/local/share/doc/groonga/en/html/news/1.1.x.html +26 -34
- data/vendor/local/share/doc/groonga/en/html/news/1.2.x.html +225 -233
- data/vendor/local/share/doc/groonga/en/html/news/1.3.x.html +48 -56
- data/vendor/local/share/doc/groonga/en/html/news/2.x.html +378 -386
- data/vendor/local/share/doc/groonga/en/html/news/3.x.html +320 -328
- data/vendor/local/share/doc/groonga/en/html/news/4.x.html +442 -448
- data/vendor/local/share/doc/groonga/en/html/news/5.x.html +742 -860
- data/vendor/local/share/doc/groonga/en/html/news/6.x.html +544 -621
- data/vendor/local/share/doc/groonga/en/html/news/senna.html +32 -40
- data/vendor/local/share/doc/groonga/en/html/objects.inv +0 -0
- data/vendor/local/share/doc/groonga/en/html/reference.html +208 -198
- data/vendor/local/share/doc/groonga/en/html/reference/alias.html +85 -93
- data/vendor/local/share/doc/groonga/en/html/reference/api.html +50 -57
- data/vendor/local/share/doc/groonga/en/html/reference/api/global_configurations.html +62 -77
- data/vendor/local/share/doc/groonga/en/html/reference/api/grn_cache.html +117 -149
- data/vendor/local/share/doc/groonga/en/html/reference/api/grn_column.html +140 -176
- data/vendor/local/share/doc/groonga/en/html/reference/api/grn_command_version.html +43 -55
- data/vendor/local/share/doc/groonga/en/html/reference/api/grn_content_type.html +48 -56
- data/vendor/local/share/doc/groonga/en/html/reference/api/grn_ctx.html +194 -254
- data/vendor/local/share/doc/groonga/en/html/reference/api/grn_db.html +106 -138
- data/vendor/local/share/doc/groonga/en/html/reference/api/grn_encoding.html +62 -82
- data/vendor/local/share/doc/groonga/en/html/reference/api/grn_expr.html +117 -137
- data/vendor/local/share/doc/groonga/en/html/reference/api/grn_geo.html +74 -98
- data/vendor/local/share/doc/groonga/en/html/reference/api/grn_hook.html +79 -103
- data/vendor/local/share/doc/groonga/en/html/reference/api/grn_ii.html +40 -48
- data/vendor/local/share/doc/groonga/en/html/reference/api/grn_index_cursor.html +57 -73
- data/vendor/local/share/doc/groonga/en/html/reference/api/grn_info.html +75 -99
- data/vendor/local/share/doc/groonga/en/html/reference/api/grn_inspect.html +495 -0
- data/vendor/local/share/doc/groonga/en/html/reference/api/grn_match_escalation.html +52 -68
- data/vendor/local/share/doc/groonga/en/html/reference/api/grn_obj.html +291 -357
- data/vendor/local/share/doc/groonga/en/html/reference/api/grn_proc.html +69 -89
- data/vendor/local/share/doc/groonga/en/html/reference/api/grn_search.html +47 -59
- data/vendor/local/share/doc/groonga/en/html/reference/api/grn_table.html +226 -306
- data/vendor/local/share/doc/groonga/en/html/reference/api/grn_table_cursor.html +120 -160
- data/vendor/local/share/doc/groonga/en/html/reference/api/grn_thread.html +80 -103
- data/vendor/local/share/doc/groonga/en/html/reference/api/grn_type.html +46 -58
- data/vendor/local/share/doc/groonga/en/html/reference/api/grn_user_data.html +40 -52
- data/vendor/local/share/doc/groonga/en/html/reference/api/overview.html +52 -66
- data/vendor/local/share/doc/groonga/en/html/reference/api/plugin.html +98 -122
- data/vendor/local/share/doc/groonga/en/html/reference/cast.html +40 -26
- data/vendor/local/share/doc/groonga/en/html/reference/column.html +16 -24
- data/vendor/local/share/doc/groonga/en/html/reference/columns/index.html +16 -24
- data/vendor/local/share/doc/groonga/en/html/reference/columns/pseudo.html +30 -34
- data/vendor/local/share/doc/groonga/en/html/reference/columns/scalar.html +16 -24
- data/vendor/local/share/doc/groonga/en/html/reference/columns/vector.html +92 -100
- data/vendor/local/share/doc/groonga/en/html/reference/command.html +76 -84
- data/vendor/local/share/doc/groonga/en/html/reference/command/command_version.html +26 -34
- data/vendor/local/share/doc/groonga/en/html/reference/command/output_format.html +64 -72
- data/vendor/local/share/doc/groonga/en/html/reference/command/pretty_print.html +21 -29
- data/vendor/local/share/doc/groonga/en/html/reference/command/request_id.html +25 -33
- data/vendor/local/share/doc/groonga/en/html/reference/command/request_timeout.html +32 -40
- data/vendor/local/share/doc/groonga/en/html/reference/command/return_code.html +105 -113
- data/vendor/local/share/doc/groonga/en/html/reference/commands/cache_limit.html +44 -50
- data/vendor/local/share/doc/groonga/en/html/reference/commands/check.html +85 -73
- data/vendor/local/share/doc/groonga/en/html/reference/commands/clearlock.html +31 -37
- data/vendor/local/share/doc/groonga/en/html/reference/commands/column_copy.html +131 -139
- data/vendor/local/share/doc/groonga/en/html/reference/commands/column_create.html +370 -326
- data/vendor/local/share/doc/groonga/en/html/reference/commands/column_list.html +115 -117
- data/vendor/local/share/doc/groonga/en/html/reference/commands/column_remove.html +38 -44
- data/vendor/local/share/doc/groonga/en/html/reference/commands/column_rename.html +47 -53
- data/vendor/local/share/doc/groonga/en/html/reference/commands/config_delete.html +40 -48
- data/vendor/local/share/doc/groonga/en/html/reference/commands/config_get.html +42 -50
- data/vendor/local/share/doc/groonga/en/html/reference/commands/config_set.html +41 -49
- data/vendor/local/share/doc/groonga/en/html/reference/commands/database_unmap.html +37 -45
- data/vendor/local/share/doc/groonga/en/html/reference/commands/define_selector.html +71 -63
- data/vendor/local/share/doc/groonga/en/html/reference/commands/defrag.html +31 -37
- data/vendor/local/share/doc/groonga/en/html/reference/commands/delete.html +49 -51
- data/vendor/local/share/doc/groonga/en/html/reference/commands/dump.html +64 -71
- data/vendor/local/share/doc/groonga/en/html/reference/commands/io_flush.html +335 -138
- data/vendor/local/share/doc/groonga/en/html/reference/commands/load.html +233 -87
- data/vendor/local/share/doc/groonga/en/html/reference/commands/lock_acquire.html +45 -53
- data/vendor/local/share/doc/groonga/en/html/reference/commands/lock_clear.html +42 -48
- data/vendor/local/share/doc/groonga/en/html/reference/commands/lock_release.html +43 -51
- data/vendor/local/share/doc/groonga/en/html/reference/commands/log_level.html +58 -64
- data/vendor/local/share/doc/groonga/en/html/reference/commands/log_put.html +33 -38
- data/vendor/local/share/doc/groonga/en/html/reference/commands/log_reopen.html +31 -38
- data/vendor/local/share/doc/groonga/en/html/reference/commands/logical_count.html +295 -218
- data/vendor/local/share/doc/groonga/en/html/reference/commands/logical_parameters.html +56 -64
- data/vendor/local/share/doc/groonga/en/html/reference/commands/logical_range_filter.html +532 -214
- data/vendor/local/share/doc/groonga/en/html/reference/commands/logical_select.html +797 -388
- data/vendor/local/share/doc/groonga/en/html/reference/commands/logical_shard_list.html +35 -43
- data/vendor/local/share/doc/groonga/en/html/reference/commands/logical_table_remove.html +188 -196
- data/vendor/local/share/doc/groonga/en/html/reference/commands/normalize.html +83 -90
- data/vendor/local/share/doc/groonga/en/html/reference/commands/normalizer_list.html +41 -48
- data/vendor/local/share/doc/groonga/en/html/reference/commands/object_exist.html +41 -49
- data/vendor/local/share/doc/groonga/en/html/reference/commands/object_inspect.html +401 -403
- data/vendor/local/share/doc/groonga/en/html/reference/commands/object_list.html +253 -261
- data/vendor/local/share/doc/groonga/en/html/reference/commands/object_remove.html +60 -68
- data/vendor/local/share/doc/groonga/en/html/reference/commands/plugin_register.html +36 -44
- data/vendor/local/share/doc/groonga/en/html/reference/commands/plugin_unregister.html +35 -43
- data/vendor/local/share/doc/groonga/en/html/reference/commands/query_expand.html +21 -29
- data/vendor/local/share/doc/groonga/en/html/reference/commands/quit.html +22 -30
- data/vendor/local/share/doc/groonga/en/html/reference/commands/range_filter.html +21 -29
- data/vendor/local/share/doc/groonga/en/html/reference/commands/register.html +39 -47
- data/vendor/local/share/doc/groonga/en/html/reference/commands/reindex.html +47 -53
- data/vendor/local/share/doc/groonga/en/html/reference/commands/request_cancel.html +72 -74
- data/vendor/local/share/doc/groonga/en/html/reference/commands/ruby_eval.html +38 -45
- data/vendor/local/share/doc/groonga/en/html/reference/commands/ruby_load.html +38 -45
- data/vendor/local/share/doc/groonga/en/html/reference/commands/schema.html +330 -338
- data/vendor/local/share/doc/groonga/en/html/reference/commands/select.html +1545 -1194
- data/vendor/local/share/doc/groonga/en/html/reference/commands/shutdown.html +57 -65
- data/vendor/local/share/doc/groonga/en/html/reference/commands/status.html +83 -91
- data/vendor/local/share/doc/groonga/en/html/reference/commands/suggest.html +119 -133
- data/vendor/local/share/doc/groonga/en/html/reference/commands/table_copy.html +30 -38
- data/vendor/local/share/doc/groonga/en/html/reference/commands/table_create.html +165 -174
- data/vendor/local/share/doc/groonga/en/html/reference/commands/table_list.html +50 -50
- data/vendor/local/share/doc/groonga/en/html/reference/commands/table_remove.html +104 -112
- data/vendor/local/share/doc/groonga/en/html/reference/commands/table_rename.html +42 -50
- data/vendor/local/share/doc/groonga/en/html/reference/commands/table_tokenize.html +49 -57
- data/vendor/local/share/doc/groonga/en/html/reference/commands/thread_limit.html +46 -54
- data/vendor/local/share/doc/groonga/en/html/reference/commands/tokenize.html +110 -117
- data/vendor/local/share/doc/groonga/en/html/reference/commands/tokenizer_list.html +41 -48
- data/vendor/local/share/doc/groonga/en/html/reference/commands/truncate.html +40 -46
- data/vendor/local/share/doc/groonga/en/html/reference/configuration.html +37 -45
- data/vendor/local/share/doc/groonga/en/html/reference/executables.html +19 -27
- data/vendor/local/share/doc/groonga/en/html/reference/executables/grndb.html +134 -114
- data/vendor/local/share/doc/groonga/en/html/reference/executables/grnslap.html +25 -31
- data/vendor/local/share/doc/groonga/en/html/reference/executables/groonga-benchmark.html +66 -66
- data/vendor/local/share/doc/groonga/en/html/reference/executables/groonga-httpd.html +174 -182
- data/vendor/local/share/doc/groonga/en/html/reference/executables/groonga-server-http.html +25 -33
- data/vendor/local/share/doc/groonga/en/html/reference/executables/groonga-suggest-create-dataset.html +27 -35
- data/vendor/local/share/doc/groonga/en/html/reference/executables/groonga-suggest-httpd.html +191 -199
- data/vendor/local/share/doc/groonga/en/html/reference/executables/groonga-suggest-learner.html +32 -40
- data/vendor/local/share/doc/groonga/en/html/reference/executables/groonga.html +189 -163
- data/vendor/local/share/doc/groonga/en/html/reference/function.html +59 -64
- data/vendor/local/share/doc/groonga/en/html/reference/functions/between.html +71 -79
- data/vendor/local/share/doc/groonga/en/html/reference/functions/cast_loose.html +210 -0
- data/vendor/local/share/doc/groonga/en/html/reference/functions/edit_distance.html +49 -55
- data/vendor/local/share/doc/groonga/en/html/reference/functions/fuzzy_search.html +38 -46
- data/vendor/local/share/doc/groonga/en/html/reference/functions/geo_distance.html +133 -142
- data/vendor/local/share/doc/groonga/en/html/reference/functions/geo_in_circle.html +67 -73
- data/vendor/local/share/doc/groonga/en/html/reference/functions/geo_in_rectangle.html +56 -62
- data/vendor/local/share/doc/groonga/en/html/reference/functions/highlight_full.html +80 -88
- data/vendor/local/share/doc/groonga/en/html/reference/functions/highlight_html.html +70 -78
- data/vendor/local/share/doc/groonga/en/html/reference/functions/html_untag.html +56 -64
- data/vendor/local/share/doc/groonga/en/html/reference/functions/in_records.html +87 -94
- data/vendor/local/share/doc/groonga/en/html/reference/functions/in_values.html +54 -62
- data/vendor/local/share/doc/groonga/en/html/reference/functions/math_abs.html +55 -63
- data/vendor/local/share/doc/groonga/en/html/reference/functions/now.html +40 -48
- data/vendor/local/share/doc/groonga/en/html/reference/functions/number_classify.html +36 -44
- data/vendor/local/share/doc/groonga/en/html/reference/functions/prefix_rk_search.html +74 -82
- data/vendor/local/share/doc/groonga/en/html/reference/functions/query.html +152 -160
- data/vendor/local/share/doc/groonga/en/html/reference/functions/rand.html +45 -52
- data/vendor/local/share/doc/groonga/en/html/reference/functions/snippet_html.html +76 -84
- data/vendor/local/share/doc/groonga/en/html/reference/functions/string_length.html +37 -45
- data/vendor/local/share/doc/groonga/en/html/reference/functions/string_substring.html +39 -47
- data/vendor/local/share/doc/groonga/en/html/reference/functions/sub_filter.html +76 -84
- data/vendor/local/share/doc/groonga/en/html/reference/functions/time_classify_day.html +37 -45
- data/vendor/local/share/doc/groonga/en/html/reference/functions/time_classify_day_of_week.html +278 -0
- data/vendor/local/share/doc/groonga/en/html/reference/functions/time_classify_hour.html +37 -45
- data/vendor/local/share/doc/groonga/en/html/reference/functions/time_classify_minute.html +36 -44
- data/vendor/local/share/doc/groonga/en/html/reference/functions/time_classify_month.html +36 -44
- data/vendor/local/share/doc/groonga/en/html/reference/functions/time_classify_second.html +36 -44
- data/vendor/local/share/doc/groonga/en/html/reference/functions/time_classify_week.html +36 -44
- data/vendor/local/share/doc/groonga/en/html/reference/functions/time_classify_year.html +37 -45
- data/vendor/local/share/doc/groonga/en/html/reference/functions/vector_find.html +368 -0
- data/vendor/local/share/doc/groonga/en/html/reference/functions/vector_new.html +40 -48
- data/vendor/local/share/doc/groonga/en/html/reference/functions/vector_size.html +54 -62
- data/vendor/local/share/doc/groonga/en/html/reference/functions/vector_slice.html +40 -47
- data/vendor/local/share/doc/groonga/en/html/reference/grn_expr.html +44 -52
- data/vendor/local/share/doc/groonga/en/html/reference/grn_expr/query_syntax.html +307 -316
- data/vendor/local/share/doc/groonga/en/html/reference/grn_expr/script_syntax.html +486 -492
- data/vendor/local/share/doc/groonga/en/html/reference/indexing.html +44 -52
- data/vendor/local/share/doc/groonga/en/html/reference/log.html +128 -147
- data/vendor/local/share/doc/groonga/en/html/reference/normalizers.html +43 -92
- data/vendor/local/share/doc/groonga/en/html/reference/normalizers/normalizer_auto.html +179 -0
- data/vendor/local/share/doc/groonga/en/html/reference/normalizers/normalizer_nfkc100.html +897 -0
- data/vendor/local/share/doc/groonga/en/html/reference/normalizers/normalizer_nfkc51.html +162 -0
- data/vendor/local/share/doc/groonga/en/html/reference/operations.html +26 -34
- data/vendor/local/share/doc/groonga/en/html/reference/operations/geolocation_search.html +48 -56
- data/vendor/local/share/doc/groonga/en/html/reference/operations/prefix_rk_search.html +47 -55
- data/vendor/local/share/doc/groonga/en/html/reference/output.html +47 -55
- data/vendor/local/share/doc/groonga/en/html/reference/query_expanders.html +20 -28
- data/vendor/local/share/doc/groonga/en/html/reference/query_expanders/tsv.html +93 -101
- data/vendor/local/share/doc/groonga/en/html/reference/regular_expression.html +228 -225
- data/vendor/local/share/doc/groonga/en/html/reference/scorer.html +59 -67
- data/vendor/local/share/doc/groonga/en/html/reference/scorers/scorer_tf_at_most.html +50 -58
- data/vendor/local/share/doc/groonga/en/html/reference/scorers/scorer_tf_idf.html +57 -65
- data/vendor/local/share/doc/groonga/en/html/reference/sharding.html +76 -86
- data/vendor/local/share/doc/groonga/en/html/reference/suggest.html +43 -51
- data/vendor/local/share/doc/groonga/en/html/reference/suggest/completion.html +159 -167
- data/vendor/local/share/doc/groonga/en/html/reference/suggest/correction.html +93 -101
- data/vendor/local/share/doc/groonga/en/html/reference/suggest/introduction.html +85 -93
- data/vendor/local/share/doc/groonga/en/html/reference/suggest/suggestion.html +88 -96
- data/vendor/local/share/doc/groonga/en/html/reference/tables.html +142 -150
- data/vendor/local/share/doc/groonga/en/html/reference/token_filter/summary.html +147 -0
- data/vendor/local/share/doc/groonga/en/html/reference/token_filters.html +31 -223
- data/vendor/local/share/doc/groonga/en/html/reference/token_filters/token_filter_nfkc100.html +626 -0
- data/vendor/local/share/doc/groonga/en/html/reference/token_filters/token_filter_stem.html +291 -0
- data/vendor/local/share/doc/groonga/en/html/reference/token_filters/token_filter_stop_word.html +287 -0
- data/vendor/local/share/doc/groonga/en/html/reference/tokenizer/summary.html +259 -0
- data/vendor/local/share/doc/groonga/en/html/reference/tokenizers.html +42 -1455
- data/vendor/local/share/doc/groonga/en/html/reference/tokenizers/token_bigram.html +368 -0
- data/vendor/local/share/doc/groonga/en/html/reference/tokenizers/token_bigram_ignore_blank.html +221 -0
- data/vendor/local/share/doc/groonga/en/html/reference/tokenizers/token_bigram_ignore_blank_split_symbol.html +240 -0
- data/vendor/local/share/doc/groonga/en/html/reference/tokenizers/token_bigram_ignore_blank_split_symbol_alpha.html +270 -0
- data/vendor/local/share/doc/groonga/en/html/reference/tokenizers/token_bigram_ignore_blank_split_symbol_alpha_digit.html +292 -0
- data/vendor/local/share/doc/groonga/en/html/reference/tokenizers/token_bigram_split_symbol.html +179 -0
- data/vendor/local/share/doc/groonga/en/html/reference/tokenizers/token_bigram_split_symbol_alpha.html +200 -0
- data/vendor/local/share/doc/groonga/en/html/reference/tokenizers/token_bigram_split_symbol_alpha_digit.html +212 -0
- data/vendor/local/share/doc/groonga/en/html/reference/tokenizers/token_delimit.html +357 -0
- data/vendor/local/share/doc/groonga/en/html/reference/tokenizers/token_delimit_null.html +162 -0
- data/vendor/local/share/doc/groonga/en/html/reference/tokenizers/token_mecab.html +783 -0
- data/vendor/local/share/doc/groonga/en/html/reference/tokenizers/token_regexp.html +289 -0
- data/vendor/local/share/doc/groonga/en/html/reference/tokenizers/token_trigram.html +194 -0
- data/vendor/local/share/doc/groonga/en/html/reference/tokenizers/token_unigram.html +194 -0
- data/vendor/local/share/doc/groonga/en/html/reference/tuning.html +71 -79
- data/vendor/local/share/doc/groonga/en/html/reference/types.html +64 -72
- data/vendor/local/share/doc/groonga/en/html/reference/window_function.html +29 -37
- data/vendor/local/share/doc/groonga/en/html/reference/window_functions/record_number.html +38 -46
- data/vendor/local/share/doc/groonga/en/html/reference/window_functions/window_count.html +38 -46
- data/vendor/local/share/doc/groonga/en/html/reference/window_functions/window_record_number.html +38 -46
- data/vendor/local/share/doc/groonga/en/html/reference/window_functions/window_sum.html +38 -46
- data/vendor/local/share/doc/groonga/en/html/search.html +13 -24
- data/vendor/local/share/doc/groonga/en/html/searchindex.js +1 -1
- data/vendor/local/share/doc/groonga/en/html/server.html +15 -23
- data/vendor/local/share/doc/groonga/en/html/server/gqtp.html +27 -35
- data/vendor/local/share/doc/groonga/en/html/server/http.html +18 -26
- data/vendor/local/share/doc/groonga/en/html/server/http/comparison.html +94 -102
- data/vendor/local/share/doc/groonga/en/html/server/http/groonga-httpd.html +15 -23
- data/vendor/local/share/doc/groonga/en/html/server/http/groonga.html +15 -23
- data/vendor/local/share/doc/groonga/en/html/server/memcached.html +18 -26
- data/vendor/local/share/doc/groonga/en/html/server/package.html +101 -109
- data/vendor/local/share/doc/groonga/en/html/spec.html +19 -27
- data/vendor/local/share/doc/groonga/en/html/spec/gqtp.html +207 -215
- data/vendor/local/share/doc/groonga/en/html/spec/search.html +39 -39
- data/vendor/local/share/doc/groonga/en/html/troubleshooting.html +15 -23
- data/vendor/local/share/doc/groonga/en/html/troubleshooting/different_results_with_the_same_keyword.html +46 -50
- data/vendor/local/share/doc/groonga/en/html/troubleshooting/how_to_analyze_error_message.html +27 -35
- data/vendor/local/share/doc/groonga/en/html/troubleshooting/mmap_cannot_allocate_memory.html +26 -31
- data/vendor/local/share/doc/groonga/en/html/tutorial.html +17 -25
- data/vendor/local/share/doc/groonga/en/html/tutorial/data.html +46 -54
- data/vendor/local/share/doc/groonga/en/html/tutorial/drilldown.html +63 -71
- data/vendor/local/share/doc/groonga/en/html/tutorial/index.html +30 -38
- data/vendor/local/share/doc/groonga/en/html/tutorial/introduction.html +88 -97
- data/vendor/local/share/doc/groonga/en/html/tutorial/lexicon.html +19 -27
- data/vendor/local/share/doc/groonga/en/html/tutorial/match_columns.html +61 -69
- data/vendor/local/share/doc/groonga/en/html/tutorial/micro_blog.html +108 -116
- data/vendor/local/share/doc/groonga/en/html/tutorial/network.html +24 -32
- data/vendor/local/share/doc/groonga/en/html/tutorial/patricia_trie.html +19 -27
- data/vendor/local/share/doc/groonga/en/html/tutorial/query_expansion.html +32 -40
- data/vendor/local/share/doc/groonga/en/html/tutorial/search.html +52 -60
- data/vendor/local/share/doc/groonga/ja/html/.buildinfo +1 -1
- data/vendor/local/share/doc/groonga/ja/html/_static/basic.css +113 -4
- data/vendor/local/share/doc/groonga/ja/html/_static/doctools.js +46 -19
- data/vendor/local/share/doc/groonga/ja/html/_static/documentation_options.js +10 -0
- data/vendor/local/share/doc/groonga/ja/html/_static/{jquery-3.1.0.js → jquery-3.2.1.js} +474 -295
- data/vendor/local/share/doc/groonga/ja/html/_static/jquery.js +4 -4
- data/vendor/local/share/doc/groonga/ja/html/_static/language_data.js +124 -0
- data/vendor/local/share/doc/groonga/ja/html/_static/pygments.css +4 -0
- data/vendor/local/share/doc/groonga/ja/html/_static/searchtools.js +70 -150
- data/vendor/local/share/doc/groonga/ja/html/characteristic.html +15 -23
- data/vendor/local/share/doc/groonga/ja/html/client.html +15 -23
- data/vendor/local/share/doc/groonga/ja/html/community.html +29 -37
- data/vendor/local/share/doc/groonga/ja/html/contribution.html +23 -31
- data/vendor/local/share/doc/groonga/ja/html/contribution/development.html +15 -23
- data/vendor/local/share/doc/groonga/ja/html/contribution/development/build.html +15 -23
- data/vendor/local/share/doc/groonga/ja/html/contribution/development/build/unix_autotools.html +50 -58
- data/vendor/local/share/doc/groonga/ja/html/contribution/development/build/unix_cmake.html +43 -48
- data/vendor/local/share/doc/groonga/ja/html/contribution/development/build/windows_cmake.html +47 -51
- data/vendor/local/share/doc/groonga/ja/html/contribution/development/com.html +26 -34
- data/vendor/local/share/doc/groonga/ja/html/contribution/development/cooperation.html +18 -26
- data/vendor/local/share/doc/groonga/ja/html/contribution/development/query.html +23 -31
- data/vendor/local/share/doc/groonga/ja/html/contribution/development/release.html +162 -162
- data/vendor/local/share/doc/groonga/ja/html/contribution/development/repository.html +16 -24
- data/vendor/local/share/doc/groonga/ja/html/contribution/development/test.html +26 -34
- data/vendor/local/share/doc/groonga/ja/html/contribution/documentation.html +15 -23
- data/vendor/local/share/doc/groonga/ja/html/contribution/documentation/c-api.html +15 -23
- data/vendor/local/share/doc/groonga/ja/html/contribution/documentation/i18n.html +50 -58
- data/vendor/local/share/doc/groonga/ja/html/contribution/documentation/introduction.html +28 -36
- data/vendor/local/share/doc/groonga/ja/html/contribution/report.html +17 -25
- data/vendor/local/share/doc/groonga/ja/html/development.html +15 -23
- data/vendor/local/share/doc/groonga/ja/html/development/travis-ci.html +32 -37
- data/vendor/local/share/doc/groonga/ja/html/genindex.html +50 -28
- data/vendor/local/share/doc/groonga/ja/html/index.html +247 -233
- data/vendor/local/share/doc/groonga/ja/html/install.html +41 -45
- data/vendor/local/share/doc/groonga/ja/html/install/centos.html +44 -52
- data/vendor/local/share/doc/groonga/ja/html/install/debian.html +52 -121
- data/vendor/local/share/doc/groonga/ja/html/install/docker.html +155 -0
- data/vendor/local/share/doc/groonga/ja/html/install/fedora.html +40 -48
- data/vendor/local/share/doc/groonga/ja/html/install/mac_os_x.html +28 -36
- data/vendor/local/share/doc/groonga/ja/html/install/others.html +116 -124
- data/vendor/local/share/doc/groonga/ja/html/install/solaris.html +28 -36
- data/vendor/local/share/doc/groonga/ja/html/install/ubuntu.html +43 -51
- data/vendor/local/share/doc/groonga/ja/html/install/windows.html +29 -37
- data/vendor/local/share/doc/groonga/ja/html/limitations.html +30 -36
- data/vendor/local/share/doc/groonga/ja/html/news.html +1234 -384
- data/vendor/local/share/doc/groonga/ja/html/news/0.x.html +82 -82
- data/vendor/local/share/doc/groonga/ja/html/news/1.0.x.html +146 -154
- data/vendor/local/share/doc/groonga/ja/html/news/1.1.x.html +25 -33
- data/vendor/local/share/doc/groonga/ja/html/news/1.2.x.html +191 -199
- data/vendor/local/share/doc/groonga/ja/html/news/1.3.x.html +41 -49
- data/vendor/local/share/doc/groonga/ja/html/news/2.x.html +283 -291
- data/vendor/local/share/doc/groonga/ja/html/news/3.x.html +229 -237
- data/vendor/local/share/doc/groonga/ja/html/news/4.x.html +274 -280
- data/vendor/local/share/doc/groonga/ja/html/news/5.x.html +475 -593
- data/vendor/local/share/doc/groonga/ja/html/news/6.x.html +313 -390
- data/vendor/local/share/doc/groonga/ja/html/news/senna.html +31 -39
- data/vendor/local/share/doc/groonga/ja/html/objects.inv +0 -0
- data/vendor/local/share/doc/groonga/ja/html/reference.html +208 -198
- data/vendor/local/share/doc/groonga/ja/html/reference/alias.html +70 -78
- data/vendor/local/share/doc/groonga/ja/html/reference/api.html +50 -57
- data/vendor/local/share/doc/groonga/ja/html/reference/api/global_configurations.html +57 -72
- data/vendor/local/share/doc/groonga/ja/html/reference/api/grn_cache.html +107 -139
- data/vendor/local/share/doc/groonga/ja/html/reference/api/grn_column.html +137 -173
- data/vendor/local/share/doc/groonga/ja/html/reference/api/grn_command_version.html +40 -52
- data/vendor/local/share/doc/groonga/ja/html/reference/api/grn_content_type.html +46 -54
- data/vendor/local/share/doc/groonga/ja/html/reference/api/grn_ctx.html +184 -244
- data/vendor/local/share/doc/groonga/ja/html/reference/api/grn_db.html +99 -131
- data/vendor/local/share/doc/groonga/ja/html/reference/api/grn_encoding.html +57 -77
- data/vendor/local/share/doc/groonga/ja/html/reference/api/grn_expr.html +100 -120
- data/vendor/local/share/doc/groonga/ja/html/reference/api/grn_geo.html +71 -95
- data/vendor/local/share/doc/groonga/ja/html/reference/api/grn_hook.html +75 -99
- data/vendor/local/share/doc/groonga/ja/html/reference/api/grn_ii.html +37 -45
- data/vendor/local/share/doc/groonga/ja/html/reference/api/grn_index_cursor.html +54 -70
- data/vendor/local/share/doc/groonga/ja/html/reference/api/grn_info.html +71 -95
- data/vendor/local/share/doc/groonga/ja/html/reference/api/grn_inspect.html +487 -0
- data/vendor/local/share/doc/groonga/ja/html/reference/api/grn_match_escalation.html +49 -65
- data/vendor/local/share/doc/groonga/ja/html/reference/api/grn_obj.html +286 -352
- data/vendor/local/share/doc/groonga/ja/html/reference/api/grn_proc.html +64 -84
- data/vendor/local/share/doc/groonga/ja/html/reference/api/grn_search.html +44 -56
- data/vendor/local/share/doc/groonga/ja/html/reference/api/grn_table.html +219 -299
- data/vendor/local/share/doc/groonga/ja/html/reference/api/grn_table_cursor.html +116 -156
- data/vendor/local/share/doc/groonga/ja/html/reference/api/grn_thread.html +70 -93
- data/vendor/local/share/doc/groonga/ja/html/reference/api/grn_type.html +42 -54
- data/vendor/local/share/doc/groonga/ja/html/reference/api/grn_user_data.html +36 -48
- data/vendor/local/share/doc/groonga/ja/html/reference/api/overview.html +48 -62
- data/vendor/local/share/doc/groonga/ja/html/reference/api/plugin.html +94 -118
- data/vendor/local/share/doc/groonga/ja/html/reference/cast.html +39 -25
- data/vendor/local/share/doc/groonga/ja/html/reference/column.html +15 -23
- data/vendor/local/share/doc/groonga/ja/html/reference/columns/index.html +15 -23
- data/vendor/local/share/doc/groonga/ja/html/reference/columns/pseudo.html +28 -32
- data/vendor/local/share/doc/groonga/ja/html/reference/columns/scalar.html +15 -23
- data/vendor/local/share/doc/groonga/ja/html/reference/columns/vector.html +76 -84
- data/vendor/local/share/doc/groonga/ja/html/reference/command.html +76 -84
- data/vendor/local/share/doc/groonga/ja/html/reference/command/command_version.html +25 -33
- data/vendor/local/share/doc/groonga/ja/html/reference/command/output_format.html +51 -59
- data/vendor/local/share/doc/groonga/ja/html/reference/command/pretty_print.html +20 -28
- data/vendor/local/share/doc/groonga/ja/html/reference/command/request_id.html +21 -29
- data/vendor/local/share/doc/groonga/ja/html/reference/command/request_timeout.html +27 -35
- data/vendor/local/share/doc/groonga/ja/html/reference/command/return_code.html +101 -109
- data/vendor/local/share/doc/groonga/ja/html/reference/commands/cache_limit.html +39 -45
- data/vendor/local/share/doc/groonga/ja/html/reference/commands/check.html +84 -72
- data/vendor/local/share/doc/groonga/ja/html/reference/commands/clearlock.html +30 -36
- data/vendor/local/share/doc/groonga/ja/html/reference/commands/column_copy.html +104 -112
- data/vendor/local/share/doc/groonga/ja/html/reference/commands/column_create.html +271 -237
- data/vendor/local/share/doc/groonga/ja/html/reference/commands/column_list.html +100 -102
- data/vendor/local/share/doc/groonga/ja/html/reference/commands/column_remove.html +37 -43
- data/vendor/local/share/doc/groonga/ja/html/reference/commands/column_rename.html +41 -47
- data/vendor/local/share/doc/groonga/ja/html/reference/commands/config_delete.html +38 -46
- data/vendor/local/share/doc/groonga/ja/html/reference/commands/config_get.html +39 -47
- data/vendor/local/share/doc/groonga/ja/html/reference/commands/config_set.html +39 -47
- data/vendor/local/share/doc/groonga/ja/html/reference/commands/database_unmap.html +34 -42
- data/vendor/local/share/doc/groonga/ja/html/reference/commands/define_selector.html +70 -62
- data/vendor/local/share/doc/groonga/ja/html/reference/commands/defrag.html +30 -36
- data/vendor/local/share/doc/groonga/ja/html/reference/commands/delete.html +42 -44
- data/vendor/local/share/doc/groonga/ja/html/reference/commands/dump.html +59 -68
- data/vendor/local/share/doc/groonga/ja/html/reference/commands/io_flush.html +300 -126
- data/vendor/local/share/doc/groonga/ja/html/reference/commands/load.html +212 -80
- data/vendor/local/share/doc/groonga/ja/html/reference/commands/lock_acquire.html +42 -50
- data/vendor/local/share/doc/groonga/ja/html/reference/commands/lock_clear.html +40 -46
- data/vendor/local/share/doc/groonga/ja/html/reference/commands/lock_release.html +41 -49
- data/vendor/local/share/doc/groonga/ja/html/reference/commands/log_level.html +57 -63
- data/vendor/local/share/doc/groonga/ja/html/reference/commands/log_put.html +32 -37
- data/vendor/local/share/doc/groonga/ja/html/reference/commands/log_reopen.html +30 -37
- data/vendor/local/share/doc/groonga/ja/html/reference/commands/logical_count.html +246 -178
- data/vendor/local/share/doc/groonga/ja/html/reference/commands/logical_parameters.html +51 -59
- data/vendor/local/share/doc/groonga/ja/html/reference/commands/logical_range_filter.html +479 -175
- data/vendor/local/share/doc/groonga/ja/html/reference/commands/logical_select.html +718 -326
- data/vendor/local/share/doc/groonga/ja/html/reference/commands/logical_shard_list.html +34 -42
- data/vendor/local/share/doc/groonga/ja/html/reference/commands/logical_table_remove.html +145 -153
- data/vendor/local/share/doc/groonga/ja/html/reference/commands/normalize.html +78 -85
- data/vendor/local/share/doc/groonga/ja/html/reference/commands/normalizer_list.html +40 -47
- data/vendor/local/share/doc/groonga/ja/html/reference/commands/object_exist.html +36 -44
- data/vendor/local/share/doc/groonga/ja/html/reference/commands/object_inspect.html +360 -362
- data/vendor/local/share/doc/groonga/ja/html/reference/commands/object_list.html +221 -229
- data/vendor/local/share/doc/groonga/ja/html/reference/commands/object_remove.html +47 -55
- data/vendor/local/share/doc/groonga/ja/html/reference/commands/plugin_register.html +32 -40
- data/vendor/local/share/doc/groonga/ja/html/reference/commands/plugin_unregister.html +33 -41
- data/vendor/local/share/doc/groonga/ja/html/reference/commands/query_expand.html +20 -28
- data/vendor/local/share/doc/groonga/ja/html/reference/commands/quit.html +21 -29
- data/vendor/local/share/doc/groonga/ja/html/reference/commands/range_filter.html +20 -28
- data/vendor/local/share/doc/groonga/ja/html/reference/commands/register.html +35 -43
- data/vendor/local/share/doc/groonga/ja/html/reference/commands/reindex.html +42 -48
- data/vendor/local/share/doc/groonga/ja/html/reference/commands/request_cancel.html +57 -57
- data/vendor/local/share/doc/groonga/ja/html/reference/commands/ruby_eval.html +36 -43
- data/vendor/local/share/doc/groonga/ja/html/reference/commands/ruby_load.html +36 -43
- data/vendor/local/share/doc/groonga/ja/html/reference/commands/schema.html +317 -325
- data/vendor/local/share/doc/groonga/ja/html/reference/commands/select.html +1246 -917
- data/vendor/local/share/doc/groonga/ja/html/reference/commands/shutdown.html +50 -58
- data/vendor/local/share/doc/groonga/ja/html/reference/commands/status.html +77 -85
- data/vendor/local/share/doc/groonga/ja/html/reference/commands/suggest.html +109 -123
- data/vendor/local/share/doc/groonga/ja/html/reference/commands/table_copy.html +29 -37
- data/vendor/local/share/doc/groonga/ja/html/reference/commands/table_create.html +131 -140
- data/vendor/local/share/doc/groonga/ja/html/reference/commands/table_list.html +49 -49
- data/vendor/local/share/doc/groonga/ja/html/reference/commands/table_remove.html +87 -95
- data/vendor/local/share/doc/groonga/ja/html/reference/commands/table_rename.html +36 -44
- data/vendor/local/share/doc/groonga/ja/html/reference/commands/table_tokenize.html +44 -52
- data/vendor/local/share/doc/groonga/ja/html/reference/commands/thread_limit.html +38 -46
- data/vendor/local/share/doc/groonga/ja/html/reference/commands/tokenize.html +93 -100
- data/vendor/local/share/doc/groonga/ja/html/reference/commands/tokenizer_list.html +40 -47
- data/vendor/local/share/doc/groonga/ja/html/reference/commands/truncate.html +39 -45
- data/vendor/local/share/doc/groonga/ja/html/reference/configuration.html +36 -44
- data/vendor/local/share/doc/groonga/ja/html/reference/executables.html +19 -27
- data/vendor/local/share/doc/groonga/ja/html/reference/executables/grndb.html +125 -107
- data/vendor/local/share/doc/groonga/ja/html/reference/executables/grnslap.html +23 -29
- data/vendor/local/share/doc/groonga/ja/html/reference/executables/groonga-benchmark.html +62 -62
- data/vendor/local/share/doc/groonga/ja/html/reference/executables/groonga-httpd.html +132 -140
- data/vendor/local/share/doc/groonga/ja/html/reference/executables/groonga-server-http.html +23 -31
- data/vendor/local/share/doc/groonga/ja/html/reference/executables/groonga-suggest-create-dataset.html +25 -33
- data/vendor/local/share/doc/groonga/ja/html/reference/executables/groonga-suggest-httpd.html +166 -174
- data/vendor/local/share/doc/groonga/ja/html/reference/executables/groonga-suggest-learner.html +31 -39
- data/vendor/local/share/doc/groonga/ja/html/reference/executables/groonga.html +189 -165
- data/vendor/local/share/doc/groonga/ja/html/reference/function.html +59 -64
- data/vendor/local/share/doc/groonga/ja/html/reference/functions/between.html +69 -77
- data/vendor/local/share/doc/groonga/ja/html/reference/functions/cast_loose.html +208 -0
- data/vendor/local/share/doc/groonga/ja/html/reference/functions/edit_distance.html +48 -54
- data/vendor/local/share/doc/groonga/ja/html/reference/functions/fuzzy_search.html +37 -45
- data/vendor/local/share/doc/groonga/ja/html/reference/functions/geo_distance.html +115 -124
- data/vendor/local/share/doc/groonga/ja/html/reference/functions/geo_in_circle.html +66 -72
- data/vendor/local/share/doc/groonga/ja/html/reference/functions/geo_in_rectangle.html +55 -61
- data/vendor/local/share/doc/groonga/ja/html/reference/functions/highlight_full.html +69 -77
- data/vendor/local/share/doc/groonga/ja/html/reference/functions/highlight_html.html +60 -68
- data/vendor/local/share/doc/groonga/ja/html/reference/functions/html_untag.html +54 -62
- data/vendor/local/share/doc/groonga/ja/html/reference/functions/in_records.html +85 -93
- data/vendor/local/share/doc/groonga/ja/html/reference/functions/in_values.html +54 -62
- data/vendor/local/share/doc/groonga/ja/html/reference/functions/math_abs.html +54 -62
- data/vendor/local/share/doc/groonga/ja/html/reference/functions/now.html +39 -47
- data/vendor/local/share/doc/groonga/ja/html/reference/functions/number_classify.html +35 -43
- data/vendor/local/share/doc/groonga/ja/html/reference/functions/prefix_rk_search.html +67 -75
- data/vendor/local/share/doc/groonga/ja/html/reference/functions/query.html +130 -138
- data/vendor/local/share/doc/groonga/ja/html/reference/functions/rand.html +44 -51
- data/vendor/local/share/doc/groonga/ja/html/reference/functions/snippet_html.html +61 -69
- data/vendor/local/share/doc/groonga/ja/html/reference/functions/string_length.html +36 -44
- data/vendor/local/share/doc/groonga/ja/html/reference/functions/string_substring.html +38 -46
- data/vendor/local/share/doc/groonga/ja/html/reference/functions/sub_filter.html +63 -71
- data/vendor/local/share/doc/groonga/ja/html/reference/functions/time_classify_day.html +36 -44
- data/vendor/local/share/doc/groonga/ja/html/reference/functions/time_classify_day_of_week.html +276 -0
- data/vendor/local/share/doc/groonga/ja/html/reference/functions/time_classify_hour.html +36 -44
- data/vendor/local/share/doc/groonga/ja/html/reference/functions/time_classify_minute.html +35 -43
- data/vendor/local/share/doc/groonga/ja/html/reference/functions/time_classify_month.html +35 -43
- data/vendor/local/share/doc/groonga/ja/html/reference/functions/time_classify_second.html +35 -43
- data/vendor/local/share/doc/groonga/ja/html/reference/functions/time_classify_week.html +35 -43
- data/vendor/local/share/doc/groonga/ja/html/reference/functions/time_classify_year.html +36 -44
- data/vendor/local/share/doc/groonga/ja/html/reference/functions/vector_find.html +353 -0
- data/vendor/local/share/doc/groonga/ja/html/reference/functions/vector_new.html +39 -47
- data/vendor/local/share/doc/groonga/ja/html/reference/functions/vector_size.html +52 -61
- data/vendor/local/share/doc/groonga/ja/html/reference/functions/vector_slice.html +38 -46
- data/vendor/local/share/doc/groonga/ja/html/reference/grn_expr.html +38 -46
- data/vendor/local/share/doc/groonga/ja/html/reference/grn_expr/query_syntax.html +200 -208
- data/vendor/local/share/doc/groonga/ja/html/reference/grn_expr/script_syntax.html +375 -382
- data/vendor/local/share/doc/groonga/ja/html/reference/indexing.html +39 -47
- data/vendor/local/share/doc/groonga/ja/html/reference/log.html +125 -144
- data/vendor/local/share/doc/groonga/ja/html/reference/normalizers.html +36 -70
- data/vendor/local/share/doc/groonga/ja/html/reference/normalizers/normalizer_auto.html +168 -0
- data/vendor/local/share/doc/groonga/ja/html/reference/normalizers/normalizer_nfkc100.html +887 -0
- data/vendor/local/share/doc/groonga/ja/html/reference/normalizers/normalizer_nfkc51.html +160 -0
- data/vendor/local/share/doc/groonga/ja/html/reference/operations.html +26 -34
- data/vendor/local/share/doc/groonga/ja/html/reference/operations/geolocation_search.html +38 -46
- data/vendor/local/share/doc/groonga/ja/html/reference/operations/prefix_rk_search.html +41 -49
- data/vendor/local/share/doc/groonga/ja/html/reference/output.html +42 -50
- data/vendor/local/share/doc/groonga/ja/html/reference/query_expanders.html +20 -28
- data/vendor/local/share/doc/groonga/ja/html/reference/query_expanders/tsv.html +68 -76
- data/vendor/local/share/doc/groonga/ja/html/reference/regular_expression.html +178 -184
- data/vendor/local/share/doc/groonga/ja/html/reference/scorer.html +38 -46
- data/vendor/local/share/doc/groonga/ja/html/reference/scorers/scorer_tf_at_most.html +38 -46
- data/vendor/local/share/doc/groonga/ja/html/reference/scorers/scorer_tf_idf.html +39 -47
- data/vendor/local/share/doc/groonga/ja/html/reference/sharding.html +63 -73
- data/vendor/local/share/doc/groonga/ja/html/reference/suggest.html +43 -51
- data/vendor/local/share/doc/groonga/ja/html/reference/suggest/completion.html +130 -138
- data/vendor/local/share/doc/groonga/ja/html/reference/suggest/correction.html +72 -80
- data/vendor/local/share/doc/groonga/ja/html/reference/suggest/introduction.html +68 -76
- data/vendor/local/share/doc/groonga/ja/html/reference/suggest/suggestion.html +76 -86
- data/vendor/local/share/doc/groonga/ja/html/reference/tables.html +129 -137
- data/vendor/local/share/doc/groonga/ja/html/reference/token_filter/summary.html +145 -0
- data/vendor/local/share/doc/groonga/ja/html/reference/token_filters.html +31 -215
- data/vendor/local/share/doc/groonga/ja/html/reference/token_filters/token_filter_nfkc100.html +617 -0
- data/vendor/local/share/doc/groonga/ja/html/reference/token_filters/token_filter_stem.html +289 -0
- data/vendor/local/share/doc/groonga/ja/html/reference/token_filters/token_filter_stop_word.html +284 -0
- data/vendor/local/share/doc/groonga/ja/html/reference/tokenizer/summary.html +233 -0
- data/vendor/local/share/doc/groonga/ja/html/reference/tokenizers.html +42 -1349
- data/vendor/local/share/doc/groonga/ja/html/reference/tokenizers/token_bigram.html +344 -0
- data/vendor/local/share/doc/groonga/ja/html/reference/tokenizers/token_bigram_ignore_blank.html +219 -0
- data/vendor/local/share/doc/groonga/ja/html/reference/tokenizers/token_bigram_ignore_blank_split_symbol.html +237 -0
- data/vendor/local/share/doc/groonga/ja/html/reference/tokenizers/token_bigram_ignore_blank_split_symbol_alpha.html +267 -0
- data/vendor/local/share/doc/groonga/ja/html/reference/tokenizers/token_bigram_ignore_blank_split_symbol_alpha_digit.html +287 -0
- data/vendor/local/share/doc/groonga/ja/html/reference/tokenizers/token_bigram_split_symbol.html +179 -0
- data/vendor/local/share/doc/groonga/ja/html/reference/tokenizers/token_bigram_split_symbol_alpha.html +199 -0
- data/vendor/local/share/doc/groonga/ja/html/reference/tokenizers/token_bigram_split_symbol_alpha_digit.html +209 -0
- data/vendor/local/share/doc/groonga/ja/html/reference/tokenizers/token_delimit.html +344 -0
- data/vendor/local/share/doc/groonga/ja/html/reference/tokenizers/token_delimit_null.html +160 -0
- data/vendor/local/share/doc/groonga/ja/html/reference/tokenizers/token_mecab.html +764 -0
- data/vendor/local/share/doc/groonga/ja/html/reference/tokenizers/token_regexp.html +284 -0
- data/vendor/local/share/doc/groonga/ja/html/reference/tokenizers/token_trigram.html +191 -0
- data/vendor/local/share/doc/groonga/ja/html/reference/tokenizers/token_unigram.html +191 -0
- data/vendor/local/share/doc/groonga/ja/html/reference/tuning.html +65 -73
- data/vendor/local/share/doc/groonga/ja/html/reference/types.html +48 -56
- data/vendor/local/share/doc/groonga/ja/html/reference/window_function.html +29 -37
- data/vendor/local/share/doc/groonga/ja/html/reference/window_functions/record_number.html +37 -45
- data/vendor/local/share/doc/groonga/ja/html/reference/window_functions/window_count.html +37 -45
- data/vendor/local/share/doc/groonga/ja/html/reference/window_functions/window_record_number.html +37 -45
- data/vendor/local/share/doc/groonga/ja/html/reference/window_functions/window_sum.html +37 -45
- data/vendor/local/share/doc/groonga/ja/html/search.html +13 -24
- data/vendor/local/share/doc/groonga/ja/html/searchindex.js +1 -1
- data/vendor/local/share/doc/groonga/ja/html/server.html +15 -23
- data/vendor/local/share/doc/groonga/ja/html/server/gqtp.html +22 -30
- data/vendor/local/share/doc/groonga/ja/html/server/http.html +17 -25
- data/vendor/local/share/doc/groonga/ja/html/server/http/comparison.html +82 -90
- data/vendor/local/share/doc/groonga/ja/html/server/http/groonga-httpd.html +15 -23
- data/vendor/local/share/doc/groonga/ja/html/server/http/groonga.html +15 -23
- data/vendor/local/share/doc/groonga/ja/html/server/memcached.html +16 -24
- data/vendor/local/share/doc/groonga/ja/html/server/package.html +99 -107
- data/vendor/local/share/doc/groonga/ja/html/spec.html +19 -27
- data/vendor/local/share/doc/groonga/ja/html/spec/gqtp.html +201 -209
- data/vendor/local/share/doc/groonga/ja/html/spec/search.html +36 -36
- data/vendor/local/share/doc/groonga/ja/html/troubleshooting.html +15 -23
- data/vendor/local/share/doc/groonga/ja/html/troubleshooting/different_results_with_the_same_keyword.html +44 -48
- data/vendor/local/share/doc/groonga/ja/html/troubleshooting/how_to_analyze_error_message.html +21 -29
- data/vendor/local/share/doc/groonga/ja/html/troubleshooting/mmap_cannot_allocate_memory.html +24 -29
- data/vendor/local/share/doc/groonga/ja/html/tutorial.html +16 -24
- data/vendor/local/share/doc/groonga/ja/html/tutorial/data.html +32 -40
- data/vendor/local/share/doc/groonga/ja/html/tutorial/drilldown.html +62 -70
- data/vendor/local/share/doc/groonga/ja/html/tutorial/index.html +22 -30
- data/vendor/local/share/doc/groonga/ja/html/tutorial/introduction.html +77 -86
- data/vendor/local/share/doc/groonga/ja/html/tutorial/lexicon.html +15 -23
- data/vendor/local/share/doc/groonga/ja/html/tutorial/match_columns.html +56 -64
- data/vendor/local/share/doc/groonga/ja/html/tutorial/micro_blog.html +84 -92
- data/vendor/local/share/doc/groonga/ja/html/tutorial/network.html +20 -28
- data/vendor/local/share/doc/groonga/ja/html/tutorial/patricia_trie.html +18 -26
- data/vendor/local/share/doc/groonga/ja/html/tutorial/query_expansion.html +21 -29
- data/vendor/local/share/doc/groonga/ja/html/tutorial/search.html +43 -51
- data/vendor/local/share/doc/pcre/AUTHORS +3 -3
- data/vendor/local/share/doc/pcre/ChangeLog +53 -0
- data/vendor/local/share/doc/pcre/LICENCE +3 -3
- data/vendor/local/share/doc/pcre/NEWS +6 -0
- data/vendor/local/share/doc/pcre/html/NON-AUTOTOOLS-BUILD.txt +8 -7
- data/vendor/local/share/groonga/mruby/LEGAL +35 -35
- data/vendor/local/share/license/cv2pdb/{README → README.MD} +28 -10
- data/vendor/local/share/license/groonga-normalizer-mysql/README.md +14 -22
- data/vendor/local/share/license/lz4/LICENSE +2 -2
- data/vendor/local/share/license/mruby/AUTHORS +3 -0
- data/vendor/local/share/license/mruby/MITL +1 -1
- data/vendor/local/share/license/mruby/README.md +1 -1
- data/vendor/local/share/license/msgpack/README.md +5 -34
- data/vendor/local/share/license/pcre/LICENCE +3 -3
- data/vendor/local/share/man/man1/lz4.1 +221 -86
- data/vendor/local/share/man/man1/lz4c.1 +222 -32
- data/vendor/local/share/man/man1/lz4cat.1 +221 -30
- data/vendor/local/share/man/man1/unlz4.1 +223 -0
- metadata +231 -87
- data/lib/2.1/groonga.so +0 -0
- data/vendor/local/lib/groonga/plugins/expression_rewriters/optimizer.rb +0 -147
- data/vendor/local/lib/groonga/scripts/ruby/expression_tree/options.rb +0 -14
- data/vendor/local/share/doc/groonga/en/html/_static/ajax-loader.gif +0 -0
- data/vendor/local/share/doc/groonga/en/html/_static/comment-bright.png +0 -0
- data/vendor/local/share/doc/groonga/en/html/_static/comment-close.png +0 -0
- data/vendor/local/share/doc/groonga/en/html/_static/comment.png +0 -0
- data/vendor/local/share/doc/groonga/en/html/_static/down-pressed.png +0 -0
- data/vendor/local/share/doc/groonga/en/html/_static/down.png +0 -0
- data/vendor/local/share/doc/groonga/en/html/_static/up-pressed.png +0 -0
- data/vendor/local/share/doc/groonga/en/html/_static/up.png +0 -0
- data/vendor/local/share/doc/groonga/en/html/_static/websupport.js +0 -808
- data/vendor/local/share/doc/groonga/ja/html/_static/ajax-loader.gif +0 -0
- data/vendor/local/share/doc/groonga/ja/html/_static/comment-bright.png +0 -0
- data/vendor/local/share/doc/groonga/ja/html/_static/comment-close.png +0 -0
- data/vendor/local/share/doc/groonga/ja/html/_static/comment.png +0 -0
- data/vendor/local/share/doc/groonga/ja/html/_static/down-pressed.png +0 -0
- data/vendor/local/share/doc/groonga/ja/html/_static/down.png +0 -0
- data/vendor/local/share/doc/groonga/ja/html/_static/up-pressed.png +0 -0
- data/vendor/local/share/doc/groonga/ja/html/_static/up.png +0 -0
- data/vendor/local/share/doc/groonga/ja/html/_static/websupport.js +0 -808
@@ -0,0 +1,259 @@
|
|
1
|
+
|
2
|
+
|
3
|
+
<!DOCTYPE html>
|
4
|
+
|
5
|
+
<html xmlns="http://www.w3.org/1999/xhtml" lang="en">
|
6
|
+
<head>
|
7
|
+
<meta charset="utf-8" />
|
8
|
+
<title>7.8.1. Summary — Groonga v9.0.2 documentation</title>
|
9
|
+
<link rel="stylesheet" href="../../_static/groonga.css" type="text/css" />
|
10
|
+
<link rel="stylesheet" href="../../_static/pygments.css" type="text/css" />
|
11
|
+
|
12
|
+
<script type="text/javascript" id="documentation_options" data-url_root="../../" src="../../_static/documentation_options.js"></script>
|
13
|
+
<script type="text/javascript" src="../../_static/jquery.js"></script>
|
14
|
+
<script type="text/javascript" src="../../_static/underscore.js"></script>
|
15
|
+
<script type="text/javascript" src="../../_static/doctools.js"></script>
|
16
|
+
<script type="text/javascript" src="../../_static/language_data.js"></script>
|
17
|
+
|
18
|
+
<link rel="shortcut icon" href="../../_static/favicon.ico"/>
|
19
|
+
<link rel="index" title="Index" href="../../genindex.html" />
|
20
|
+
<link rel="search" title="Search" href="../../search.html" />
|
21
|
+
<link rel="next" title="7.8.2. TokenBigram" href="../tokenizers/token_bigram.html" />
|
22
|
+
<link rel="prev" title="7.8. Tokenizers" href="../tokenizers.html" />
|
23
|
+
</head><body>
|
24
|
+
<div class="header">
|
25
|
+
<h1 class="title">
|
26
|
+
<a id="top-link" href="../../index.html">
|
27
|
+
<span class="project">groonga</span>
|
28
|
+
<span class="separator">-</span>
|
29
|
+
<span class="description">An open-source fulltext search engine and column store.</span>
|
30
|
+
</a>
|
31
|
+
</h1>
|
32
|
+
|
33
|
+
<div class="other-language-links">
|
34
|
+
<ul>
|
35
|
+
<li><a href="../../../../ja/html/reference/tokenizer/summary.html">日本語</a></li>
|
36
|
+
</ul>
|
37
|
+
</div>
|
38
|
+
</div>
|
39
|
+
|
40
|
+
|
41
|
+
<div class="related" role="navigation" aria-label="related navigation">
|
42
|
+
<h3>Navigation</h3>
|
43
|
+
<ul>
|
44
|
+
<li class="right" style="margin-right: 10px">
|
45
|
+
<a href="../../genindex.html" title="General Index"
|
46
|
+
accesskey="I">index</a></li>
|
47
|
+
<li class="right" >
|
48
|
+
<a href="../tokenizers/token_bigram.html" title="7.8.2. TokenBigram"
|
49
|
+
accesskey="N">next</a> |</li>
|
50
|
+
<li class="right" >
|
51
|
+
<a href="../tokenizers.html" title="7.8. Tokenizers"
|
52
|
+
accesskey="P">previous</a> |</li>
|
53
|
+
<li class="nav-item nav-item-0"><a href="../../index.html">Groonga v9.0.2 documentation</a> »</li>
|
54
|
+
<li class="nav-item nav-item-1"><a href="../../reference.html" >7. Reference manual</a> »</li>
|
55
|
+
<li class="nav-item nav-item-2"><a href="../tokenizers.html" accesskey="U">7.8. Tokenizers</a> »</li>
|
56
|
+
</ul>
|
57
|
+
</div>
|
58
|
+
|
59
|
+
<div class="document">
|
60
|
+
<div class="documentwrapper">
|
61
|
+
<div class="bodywrapper">
|
62
|
+
<div class="body" role="main">
|
63
|
+
|
64
|
+
<div class="section" id="summary">
|
65
|
+
<h1>7.8.1. Summary<a class="headerlink" href="#summary" title="Permalink to this headline">¶</a></h1>
|
66
|
+
<p>Groonga has tokenizer module that tokenizes text. It is used when
|
67
|
+
the following cases:</p>
|
68
|
+
<blockquote>
|
69
|
+
<div><ul>
|
70
|
+
<li><p>Indexing text</p>
|
71
|
+
<div class="figure align-center" id="id1">
|
72
|
+
<a class="reference internal image-reference" href="../../_images/used-when-indexing.png"><img alt="../../_images/used-when-indexing.png" src="../../_images/used-when-indexing.png" style="width: 80%;" /></a>
|
73
|
+
<p class="caption"><span class="caption-text">Tokenizer is used when indexing text.</span><a class="headerlink" href="#id1" title="Permalink to this image">¶</a></p>
|
74
|
+
</div>
|
75
|
+
</li>
|
76
|
+
<li><p>Searching by query</p>
|
77
|
+
<div class="figure align-center" id="id2">
|
78
|
+
<a class="reference internal image-reference" href="../../_images/used-when-searching.png"><img alt="../../_images/used-when-searching.png" src="../../_images/used-when-searching.png" style="width: 80%;" /></a>
|
79
|
+
<p class="caption"><span class="caption-text">Tokenizer is used when searching by query.</span><a class="headerlink" href="#id2" title="Permalink to this image">¶</a></p>
|
80
|
+
</div>
|
81
|
+
</li>
|
82
|
+
</ul>
|
83
|
+
</div></blockquote>
|
84
|
+
<p>Tokenizer is an important module for full-text search. You can change
|
85
|
+
trade-off between <a class="reference external" href="http://en.wikipedia.org/wiki/Precision_and_recall">precision and recall</a> by changing
|
86
|
+
tokenizer.</p>
|
87
|
+
<p>Normally, <a class="reference internal" href="../tokenizers/token_bigram.html#token-bigram"><span class="std std-ref">TokenBigram</span></a> is a suitable tokenizer. If you don’t
|
88
|
+
know much about tokenizer, it’s recommended that you choose
|
89
|
+
<a class="reference internal" href="../tokenizers/token_bigram.html#token-bigram"><span class="std std-ref">TokenBigram</span></a>.</p>
|
90
|
+
<p>You can try a tokenizer by <a class="reference internal" href="../commands/tokenize.html"><span class="doc">tokenize</span></a> and
|
91
|
+
<a class="reference internal" href="../commands/table_tokenize.html"><span class="doc">table_tokenize</span></a>. Here is an example to
|
92
|
+
try <a class="reference internal" href="../tokenizers/token_bigram.html#token-bigram"><span class="std std-ref">TokenBigram</span></a> tokenizer by
|
93
|
+
<a class="reference internal" href="../commands/tokenize.html"><span class="doc">tokenize</span></a>:</p>
|
94
|
+
<p>Execution example:</p>
|
95
|
+
<div class="highlight-none notranslate"><div class="highlight"><pre><span></span>tokenize TokenBigram "Hello World"
|
96
|
+
# [
|
97
|
+
# [
|
98
|
+
# 0,
|
99
|
+
# 1337566253.89858,
|
100
|
+
# 0.000355720520019531
|
101
|
+
# ],
|
102
|
+
# [
|
103
|
+
# {
|
104
|
+
# "position": 0,
|
105
|
+
# "force_prefix": false,
|
106
|
+
# "value": "He"
|
107
|
+
# },
|
108
|
+
# {
|
109
|
+
# "position": 1,
|
110
|
+
# "force_prefix": false,
|
111
|
+
# "value": "el"
|
112
|
+
# },
|
113
|
+
# {
|
114
|
+
# "position": 2,
|
115
|
+
# "force_prefix": false,
|
116
|
+
# "value": "ll"
|
117
|
+
# },
|
118
|
+
# {
|
119
|
+
# "position": 3,
|
120
|
+
# "force_prefix": false,
|
121
|
+
# "value": "lo"
|
122
|
+
# },
|
123
|
+
# {
|
124
|
+
# "position": 4,
|
125
|
+
# "force_prefix": false,
|
126
|
+
# "value": "o "
|
127
|
+
# },
|
128
|
+
# {
|
129
|
+
# "position": 5,
|
130
|
+
# "force_prefix": false,
|
131
|
+
# "value": " W"
|
132
|
+
# },
|
133
|
+
# {
|
134
|
+
# "position": 6,
|
135
|
+
# "force_prefix": false,
|
136
|
+
# "value": "Wo"
|
137
|
+
# },
|
138
|
+
# {
|
139
|
+
# "position": 7,
|
140
|
+
# "force_prefix": false,
|
141
|
+
# "value": "or"
|
142
|
+
# },
|
143
|
+
# {
|
144
|
+
# "position": 8,
|
145
|
+
# "force_prefix": false,
|
146
|
+
# "value": "rl"
|
147
|
+
# },
|
148
|
+
# {
|
149
|
+
# "position": 9,
|
150
|
+
# "force_prefix": false,
|
151
|
+
# "value": "ld"
|
152
|
+
# },
|
153
|
+
# {
|
154
|
+
# "position": 10,
|
155
|
+
# "force_prefix": false,
|
156
|
+
# "value": "d"
|
157
|
+
# }
|
158
|
+
# ]
|
159
|
+
# ]
|
160
|
+
</pre></div>
|
161
|
+
</div>
|
162
|
+
<p>“tokenize” is the process that extracts zero or more tokens from a
|
163
|
+
text. There are some “tokenize” methods.</p>
|
164
|
+
<p>For example, <code class="docutils literal notranslate"><span class="pre">Hello</span> <span class="pre">World</span></code> is tokenized to the following tokens by
|
165
|
+
bigram tokenize method:</p>
|
166
|
+
<blockquote>
|
167
|
+
<div><ul class="simple">
|
168
|
+
<li><p><code class="docutils literal notranslate"><span class="pre">He</span></code></p></li>
|
169
|
+
<li><p><code class="docutils literal notranslate"><span class="pre">el</span></code></p></li>
|
170
|
+
<li><p><code class="docutils literal notranslate"><span class="pre">ll</span></code></p></li>
|
171
|
+
<li><p><code class="docutils literal notranslate"><span class="pre">lo</span></code></p></li>
|
172
|
+
<li><p><code class="docutils literal notranslate"><span class="pre">o_</span></code> (<code class="docutils literal notranslate"><span class="pre">_</span></code> means a white-space)</p></li>
|
173
|
+
<li><p><code class="docutils literal notranslate"><span class="pre">_W</span></code> (<code class="docutils literal notranslate"><span class="pre">_</span></code> means a white-space)</p></li>
|
174
|
+
<li><p><code class="docutils literal notranslate"><span class="pre">Wo</span></code></p></li>
|
175
|
+
<li><p><code class="docutils literal notranslate"><span class="pre">or</span></code></p></li>
|
176
|
+
<li><p><code class="docutils literal notranslate"><span class="pre">rl</span></code></p></li>
|
177
|
+
<li><p><code class="docutils literal notranslate"><span class="pre">ld</span></code></p></li>
|
178
|
+
</ul>
|
179
|
+
</div></blockquote>
|
180
|
+
<p>In the above example, 10 tokens are extracted from one text <code class="docutils literal notranslate"><span class="pre">Hello</span>
|
181
|
+
<span class="pre">World</span></code>.</p>
|
182
|
+
<p>For example, <code class="docutils literal notranslate"><span class="pre">Hello</span> <span class="pre">World</span></code> is tokenized to the following tokens by
|
183
|
+
white-space-separate tokenize method:</p>
|
184
|
+
<blockquote>
|
185
|
+
<div><ul class="simple">
|
186
|
+
<li><p><code class="docutils literal notranslate"><span class="pre">Hello</span></code></p></li>
|
187
|
+
<li><p><code class="docutils literal notranslate"><span class="pre">World</span></code></p></li>
|
188
|
+
</ul>
|
189
|
+
</div></blockquote>
|
190
|
+
<p>In the above example, 2 tokens are extracted from one text <code class="docutils literal notranslate"><span class="pre">Hello</span>
|
191
|
+
<span class="pre">World</span></code>.</p>
|
192
|
+
<p>Token is used as search key. You can find indexed documents only by
|
193
|
+
tokens that are extracted by used tokenize method. For example, you
|
194
|
+
can find <code class="docutils literal notranslate"><span class="pre">Hello</span> <span class="pre">World</span></code> by <code class="docutils literal notranslate"><span class="pre">ll</span></code> with bigram tokenize method but you
|
195
|
+
can’t find <code class="docutils literal notranslate"><span class="pre">Hello</span> <span class="pre">World</span></code> by <code class="docutils literal notranslate"><span class="pre">ll</span></code> with white-space-separate tokenize
|
196
|
+
method. Because white-space-separate tokenize method doesn’t extract
|
197
|
+
<code class="docutils literal notranslate"><span class="pre">ll</span></code> token. It just extracts <code class="docutils literal notranslate"><span class="pre">Hello</span></code> and <code class="docutils literal notranslate"><span class="pre">World</span></code> tokens.</p>
|
198
|
+
<p>In general, tokenize method that generates small tokens increases
|
199
|
+
recall but decreases precision. Tokenize method that generates large
|
200
|
+
tokens increases precision but decreases recall.</p>
|
201
|
+
<p>For example, we can find <code class="docutils literal notranslate"><span class="pre">Hello</span> <span class="pre">World</span></code> and <code class="docutils literal notranslate"><span class="pre">A</span> <span class="pre">or</span> <span class="pre">B</span></code> by <code class="docutils literal notranslate"><span class="pre">or</span></code> with
|
202
|
+
bigram tokenize method. <code class="docutils literal notranslate"><span class="pre">Hello</span> <span class="pre">World</span></code> is a noise for people who
|
203
|
+
wants to search “logical and”. It means that precision is
|
204
|
+
decreased. But recall is increased.</p>
|
205
|
+
<p>We can find only <code class="docutils literal notranslate"><span class="pre">A</span> <span class="pre">or</span> <span class="pre">B</span></code> by <code class="docutils literal notranslate"><span class="pre">or</span></code> with white-space-separate
|
206
|
+
tokenize method. Because <code class="docutils literal notranslate"><span class="pre">World</span></code> is tokenized to one token <code class="docutils literal notranslate"><span class="pre">World</span></code>
|
207
|
+
with white-space-separate tokenize method. It means that precision is
|
208
|
+
increased for people who wants to search “logical and”. But recall is
|
209
|
+
decreased because <code class="docutils literal notranslate"><span class="pre">Hello</span> <span class="pre">World</span></code> that contains <code class="docutils literal notranslate"><span class="pre">or</span></code> isn’t found.</p>
|
210
|
+
</div>
|
211
|
+
|
212
|
+
|
213
|
+
</div>
|
214
|
+
</div>
|
215
|
+
</div>
|
216
|
+
<div class="sphinxsidebar" role="navigation" aria-label="main navigation">
|
217
|
+
<div class="sphinxsidebarwrapper">
|
218
|
+
<h4>Previous topic</h4>
|
219
|
+
<p class="topless"><a href="../tokenizers.html"
|
220
|
+
title="previous chapter">7.8. Tokenizers</a></p>
|
221
|
+
<h4>Next topic</h4>
|
222
|
+
<p class="topless"><a href="../tokenizers/token_bigram.html"
|
223
|
+
title="next chapter">7.8.2. <code class="docutils literal notranslate"><span class="pre">TokenBigram</span></code></a></p>
|
224
|
+
<div id="searchbox" style="display: none" role="search">
|
225
|
+
<h3>Quick search</h3>
|
226
|
+
<div class="searchformwrapper">
|
227
|
+
<form class="search" action="../../search.html" method="get">
|
228
|
+
<input type="text" name="q" />
|
229
|
+
<input type="submit" value="Go" />
|
230
|
+
</form>
|
231
|
+
</div>
|
232
|
+
</div>
|
233
|
+
<script type="text/javascript">$('#searchbox').show(0);</script>
|
234
|
+
</div>
|
235
|
+
</div>
|
236
|
+
<div class="clearer"></div>
|
237
|
+
</div>
|
238
|
+
<div class="related" role="navigation" aria-label="related navigation">
|
239
|
+
<h3>Navigation</h3>
|
240
|
+
<ul>
|
241
|
+
<li class="right" style="margin-right: 10px">
|
242
|
+
<a href="../../genindex.html" title="General Index"
|
243
|
+
>index</a></li>
|
244
|
+
<li class="right" >
|
245
|
+
<a href="../tokenizers/token_bigram.html" title="7.8.2. TokenBigram"
|
246
|
+
>next</a> |</li>
|
247
|
+
<li class="right" >
|
248
|
+
<a href="../tokenizers.html" title="7.8. Tokenizers"
|
249
|
+
>previous</a> |</li>
|
250
|
+
<li class="nav-item nav-item-0"><a href="../../index.html">Groonga v9.0.2 documentation</a> »</li>
|
251
|
+
<li class="nav-item nav-item-1"><a href="../../reference.html" >7. Reference manual</a> »</li>
|
252
|
+
<li class="nav-item nav-item-2"><a href="../tokenizers.html" >7.8. Tokenizers</a> »</li>
|
253
|
+
</ul>
|
254
|
+
</div>
|
255
|
+
<div class="footer" role="contentinfo">
|
256
|
+
© Copyright 2009-2019, Brazil, Inc.
|
257
|
+
</div>
|
258
|
+
</body>
|
259
|
+
</html>
|
@@ -1,34 +1,26 @@
|
|
1
1
|
|
2
2
|
|
3
|
-
<!DOCTYPE html
|
4
|
-
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
3
|
+
<!DOCTYPE html>
|
5
4
|
|
6
5
|
<html xmlns="http://www.w3.org/1999/xhtml" lang="en">
|
7
6
|
<head>
|
8
|
-
<meta
|
9
|
-
<title>7.8. Tokenizers — Groonga
|
7
|
+
<meta charset="utf-8" />
|
8
|
+
<title>7.8. Tokenizers — Groonga v9.0.2 documentation</title>
|
10
9
|
<link rel="stylesheet" href="../_static/groonga.css" type="text/css" />
|
11
10
|
<link rel="stylesheet" href="../_static/pygments.css" type="text/css" />
|
12
|
-
|
13
|
-
|
14
|
-
URL_ROOT: '../',
|
15
|
-
VERSION: '7.1.0-73-g6d02cfa',
|
16
|
-
COLLAPSE_INDEX: false,
|
17
|
-
FILE_SUFFIX: '.html',
|
18
|
-
HAS_SOURCE: false,
|
19
|
-
SOURCELINK_SUFFIX: '.txt'
|
20
|
-
};
|
21
|
-
</script>
|
11
|
+
|
12
|
+
<script type="text/javascript" id="documentation_options" data-url_root="../" src="../_static/documentation_options.js"></script>
|
22
13
|
<script type="text/javascript" src="../_static/jquery.js"></script>
|
23
14
|
<script type="text/javascript" src="../_static/underscore.js"></script>
|
24
15
|
<script type="text/javascript" src="../_static/doctools.js"></script>
|
16
|
+
<script type="text/javascript" src="../_static/language_data.js"></script>
|
17
|
+
|
25
18
|
<link rel="shortcut icon" href="../_static/favicon.ico"/>
|
26
19
|
<link rel="index" title="Index" href="../genindex.html" />
|
27
20
|
<link rel="search" title="Search" href="../search.html" />
|
28
|
-
<link rel="next" title="7.
|
29
|
-
<link rel="prev" title="7.7.
|
30
|
-
</head>
|
31
|
-
<body>
|
21
|
+
<link rel="next" title="7.8.1. Summary" href="tokenizer/summary.html" />
|
22
|
+
<link rel="prev" title="7.7.2.3. NormalizerNFKC51" href="normalizers/normalizer_nfkc51.html" />
|
23
|
+
</head><body>
|
32
24
|
<div class="header">
|
33
25
|
<h1 class="title">
|
34
26
|
<a id="top-link" href="../index.html">
|
@@ -53,12 +45,12 @@
|
|
53
45
|
<a href="../genindex.html" title="General Index"
|
54
46
|
accesskey="I">index</a></li>
|
55
47
|
<li class="right" >
|
56
|
-
<a href="
|
48
|
+
<a href="tokenizer/summary.html" title="7.8.1. Summary"
|
57
49
|
accesskey="N">next</a> |</li>
|
58
50
|
<li class="right" >
|
59
|
-
<a href="normalizers.html" title="7.7.
|
51
|
+
<a href="normalizers/normalizer_nfkc51.html" title="7.7.2.3. NormalizerNFKC51"
|
60
52
|
accesskey="P">previous</a> |</li>
|
61
|
-
<li class="nav-item nav-item-0"><a href="../index.html">Groonga
|
53
|
+
<li class="nav-item nav-item-0"><a href="../index.html">Groonga v9.0.2 documentation</a> »</li>
|
62
54
|
<li class="nav-item nav-item-1"><a href="../reference.html" accesskey="U">7. Reference manual</a> »</li>
|
63
55
|
</ul>
|
64
56
|
</div>
|
@@ -70,1403 +62,24 @@
|
|
70
62
|
|
71
63
|
<div class="section" id="tokenizers">
|
72
64
|
<h1>7.8. Tokenizers<a class="headerlink" href="#tokenizers" title="Permalink to this headline">¶</a></h1>
|
73
|
-
<div class="
|
74
|
-
<
|
75
|
-
<
|
76
|
-
|
77
|
-
<
|
78
|
-
<
|
79
|
-
<li><
|
80
|
-
<
|
81
|
-
<a class="reference internal
|
82
|
-
<
|
83
|
-
</
|
84
|
-
</li>
|
85
|
-
<li><
|
86
|
-
<
|
87
|
-
<a class="reference internal
|
88
|
-
<
|
89
|
-
</
|
90
|
-
</li>
|
91
|
-
</ul>
|
92
|
-
</div></blockquote>
|
93
|
-
<p>Tokenizer is an important module for full-text search. You can change
|
94
|
-
trade-off between <a class="reference external" href="http://en.wikipedia.org/wiki/Precision_and_recall">precision and recall</a> by changing
|
95
|
-
tokenizer.</p>
|
96
|
-
<p>Normally, <a class="reference internal" href="#token-bigram"><span class="std std-ref">TokenBigram</span></a> is a suitable tokenizer. If you don't
|
97
|
-
know much about tokenizer, it's recommended that you choose
|
98
|
-
<a class="reference internal" href="#token-bigram"><span class="std std-ref">TokenBigram</span></a>.</p>
|
99
|
-
<p>You can try a tokenizer by <a class="reference internal" href="commands/tokenize.html"><span class="doc">tokenize</span></a> and
|
100
|
-
<a class="reference internal" href="commands/table_tokenize.html"><span class="doc">table_tokenize</span></a>. Here is an example to
|
101
|
-
try <a class="reference internal" href="#token-bigram"><span class="std std-ref">TokenBigram</span></a> tokenizer by
|
102
|
-
<a class="reference internal" href="commands/tokenize.html"><span class="doc">tokenize</span></a>:</p>
|
103
|
-
<p>Execution example:</p>
|
104
|
-
<div class="highlight-none"><div class="highlight"><pre><span></span>tokenize TokenBigram "Hello World"
|
105
|
-
# [
|
106
|
-
# [
|
107
|
-
# 0,
|
108
|
-
# 1337566253.89858,
|
109
|
-
# 0.000355720520019531
|
110
|
-
# ],
|
111
|
-
# [
|
112
|
-
# {
|
113
|
-
# "position": 0,
|
114
|
-
# "force_prefix": false,
|
115
|
-
# "value": "He"
|
116
|
-
# },
|
117
|
-
# {
|
118
|
-
# "position": 1,
|
119
|
-
# "force_prefix": false,
|
120
|
-
# "value": "el"
|
121
|
-
# },
|
122
|
-
# {
|
123
|
-
# "position": 2,
|
124
|
-
# "force_prefix": false,
|
125
|
-
# "value": "ll"
|
126
|
-
# },
|
127
|
-
# {
|
128
|
-
# "position": 3,
|
129
|
-
# "force_prefix": false,
|
130
|
-
# "value": "lo"
|
131
|
-
# },
|
132
|
-
# {
|
133
|
-
# "position": 4,
|
134
|
-
# "force_prefix": false,
|
135
|
-
# "value": "o "
|
136
|
-
# },
|
137
|
-
# {
|
138
|
-
# "position": 5,
|
139
|
-
# "force_prefix": false,
|
140
|
-
# "value": " W"
|
141
|
-
# },
|
142
|
-
# {
|
143
|
-
# "position": 6,
|
144
|
-
# "force_prefix": false,
|
145
|
-
# "value": "Wo"
|
146
|
-
# },
|
147
|
-
# {
|
148
|
-
# "position": 7,
|
149
|
-
# "force_prefix": false,
|
150
|
-
# "value": "or"
|
151
|
-
# },
|
152
|
-
# {
|
153
|
-
# "position": 8,
|
154
|
-
# "force_prefix": false,
|
155
|
-
# "value": "rl"
|
156
|
-
# },
|
157
|
-
# {
|
158
|
-
# "position": 9,
|
159
|
-
# "force_prefix": false,
|
160
|
-
# "value": "ld"
|
161
|
-
# },
|
162
|
-
# {
|
163
|
-
# "position": 10,
|
164
|
-
# "force_prefix": false,
|
165
|
-
# "value": "d"
|
166
|
-
# }
|
167
|
-
# ]
|
168
|
-
# ]
|
169
|
-
</pre></div>
|
170
|
-
</div>
|
171
|
-
</div>
|
172
|
-
<div class="section" id="what-is-tokenize">
|
173
|
-
<h2>7.8.2. What is "tokenize"?<a class="headerlink" href="#what-is-tokenize" title="Permalink to this headline">¶</a></h2>
|
174
|
-
<p>"tokenize" is the process that extracts zero or more tokens from a
|
175
|
-
text. There are some "tokenize" methods.</p>
|
176
|
-
<p>For example, <code class="docutils literal"><span class="pre">Hello</span> <span class="pre">World</span></code> is tokenized to the following tokens by
|
177
|
-
bigram tokenize method:</p>
|
178
|
-
<blockquote>
|
179
|
-
<div><ul class="simple">
|
180
|
-
<li><code class="docutils literal"><span class="pre">He</span></code></li>
|
181
|
-
<li><code class="docutils literal"><span class="pre">el</span></code></li>
|
182
|
-
<li><code class="docutils literal"><span class="pre">ll</span></code></li>
|
183
|
-
<li><code class="docutils literal"><span class="pre">lo</span></code></li>
|
184
|
-
<li><code class="docutils literal"><span class="pre">o_</span></code> (<code class="docutils literal"><span class="pre">_</span></code> means a white-space)</li>
|
185
|
-
<li><code class="docutils literal"><span class="pre">_W</span></code> (<code class="docutils literal"><span class="pre">_</span></code> means a white-space)</li>
|
186
|
-
<li><code class="docutils literal"><span class="pre">Wo</span></code></li>
|
187
|
-
<li><code class="docutils literal"><span class="pre">or</span></code></li>
|
188
|
-
<li><code class="docutils literal"><span class="pre">rl</span></code></li>
|
189
|
-
<li><code class="docutils literal"><span class="pre">ld</span></code></li>
|
190
|
-
</ul>
|
191
|
-
</div></blockquote>
|
192
|
-
<p>In the above example, 10 tokens are extracted from one text <code class="docutils literal"><span class="pre">Hello</span>
|
193
|
-
<span class="pre">World</span></code>.</p>
|
194
|
-
<p>For example, <code class="docutils literal"><span class="pre">Hello</span> <span class="pre">World</span></code> is tokenized to the following tokens by
|
195
|
-
white-space-separate tokenize method:</p>
|
196
|
-
<blockquote>
|
197
|
-
<div><ul class="simple">
|
198
|
-
<li><code class="docutils literal"><span class="pre">Hello</span></code></li>
|
199
|
-
<li><code class="docutils literal"><span class="pre">World</span></code></li>
|
200
|
-
</ul>
|
201
|
-
</div></blockquote>
|
202
|
-
<p>In the above example, 2 tokens are extracted from one text <code class="docutils literal"><span class="pre">Hello</span>
|
203
|
-
<span class="pre">World</span></code>.</p>
|
204
|
-
<p>Token is used as search key. You can find indexed documents only by
|
205
|
-
tokens that are extracted by used tokenize method. For example, you
|
206
|
-
can find <code class="docutils literal"><span class="pre">Hello</span> <span class="pre">World</span></code> by <code class="docutils literal"><span class="pre">ll</span></code> with bigram tokenize method but you
|
207
|
-
can't find <code class="docutils literal"><span class="pre">Hello</span> <span class="pre">World</span></code> by <code class="docutils literal"><span class="pre">ll</span></code> with white-space-separate tokenize
|
208
|
-
method. Because white-space-separate tokenize method doesn't extract
|
209
|
-
<code class="docutils literal"><span class="pre">ll</span></code> token. It just extracts <code class="docutils literal"><span class="pre">Hello</span></code> and <code class="docutils literal"><span class="pre">World</span></code> tokens.</p>
|
210
|
-
<p>In general, tokenize method that generates small tokens increases
|
211
|
-
recall but decreases precision. Tokenize method that generates large
|
212
|
-
tokens increases precision but decreases recall.</p>
|
213
|
-
<p>For example, we can find <code class="docutils literal"><span class="pre">Hello</span> <span class="pre">World</span></code> and <code class="docutils literal"><span class="pre">A</span> <span class="pre">or</span> <span class="pre">B</span></code> by <code class="docutils literal"><span class="pre">or</span></code> with
|
214
|
-
bigram tokenize method. <code class="docutils literal"><span class="pre">Hello</span> <span class="pre">World</span></code> is a noise for people who
|
215
|
-
want to search "logical and". It means that precision is
|
216
|
-
decreased. But recall is increased.</p>
|
217
|
-
<p>We can find only <code class="docutils literal"><span class="pre">A</span> <span class="pre">or</span> <span class="pre">B</span></code> by <code class="docutils literal"><span class="pre">or</span></code> with white-space-separate
|
218
|
-
tokenize method. Because <code class="docutils literal"><span class="pre">World</span></code> is tokenized to one token <code class="docutils literal"><span class="pre">World</span></code>
|
219
|
-
with white-space-separate tokenize method. It means that precision is
|
220
|
-
increased for people who want to search "logical and". But recall is
|
221
|
-
decreased because <code class="docutils literal"><span class="pre">Hello</span> <span class="pre">World</span></code> that contains <code class="docutils literal"><span class="pre">or</span></code> isn't found.</p>
|
222
|
-
</div>
|
223
|
-
<div class="section" id="built-in-tokenizsers">
|
224
|
-
<h2>7.8.3. Built-in tokenizers<a class="headerlink" href="#built-in-tokenizsers" title="Permalink to this headline">¶</a></h2>
|
225
|
-
<p>Here is a list of built-in tokenizers:</p>
|
226
|
-
<blockquote>
|
227
|
-
<div><ul class="simple">
|
228
|
-
<li><code class="docutils literal"><span class="pre">TokenBigram</span></code></li>
|
229
|
-
<li><code class="docutils literal"><span class="pre">TokenBigramSplitSymbol</span></code></li>
|
230
|
-
<li><code class="docutils literal"><span class="pre">TokenBigramSplitSymbolAlpha</span></code></li>
|
231
|
-
<li><code class="docutils literal"><span class="pre">TokenBigramSplitSymbolAlphaDigit</span></code></li>
|
232
|
-
<li><code class="docutils literal"><span class="pre">TokenBigramIgnoreBlank</span></code></li>
|
233
|
-
<li><code class="docutils literal"><span class="pre">TokenBigramIgnoreBlankSplitSymbol</span></code></li>
|
234
|
-
<li><code class="docutils literal"><span class="pre">TokenBigramIgnoreBlankSplitSymbolAlpha</span></code></li>
|
235
|
-
<li><code class="docutils literal"><span class="pre">TokenBigramIgnoreBlankSplitSymbolAlphaDigit</span></code></li>
|
236
|
-
<li><code class="docutils literal"><span class="pre">TokenUnigram</span></code></li>
|
237
|
-
<li><code class="docutils literal"><span class="pre">TokenTrigram</span></code></li>
|
238
|
-
<li><code class="docutils literal"><span class="pre">TokenDelimit</span></code></li>
|
239
|
-
<li><code class="docutils literal"><span class="pre">TokenDelimitNull</span></code></li>
|
240
|
-
<li><code class="docutils literal"><span class="pre">TokenMecab</span></code></li>
|
241
|
-
<li><code class="docutils literal"><span class="pre">TokenRegexp</span></code></li>
|
242
|
-
</ul>
|
243
|
-
</div></blockquote>
|
244
|
-
<div class="section" id="tokenbigram">
|
245
|
-
<span id="token-bigram"></span><h3>7.8.3.1. <code class="docutils literal"><span class="pre">TokenBigram</span></code><a class="headerlink" href="#tokenbigram" title="Permalink to this headline">¶</a></h3>
|
246
|
-
<p><code class="docutils literal"><span class="pre">TokenBigram</span></code> is a bigram based tokenizer. It's recommended to use
|
247
|
-
this tokenizer for most cases.</p>
|
248
|
-
<p>Bigram tokenize method tokenizes a text to two adjacent characters
|
249
|
-
tokens. For example, <code class="docutils literal"><span class="pre">Hello</span></code> is tokenized to the following tokens:</p>
|
250
|
-
<blockquote>
|
251
|
-
<div><ul class="simple">
|
252
|
-
<li><code class="docutils literal"><span class="pre">He</span></code></li>
|
253
|
-
<li><code class="docutils literal"><span class="pre">el</span></code></li>
|
254
|
-
<li><code class="docutils literal"><span class="pre">ll</span></code></li>
|
255
|
-
<li><code class="docutils literal"><span class="pre">lo</span></code></li>
|
256
|
-
</ul>
|
257
|
-
</div></blockquote>
|
258
|
-
<p>Bigram tokenize method is good for recall because you can find all
|
259
|
-
texts by a query that consists of two or more characters.</p>
|
260
|
-
<p>In general, you can't find all texts by a query that consists of one
|
261
|
-
character because one character token doesn't exist. But you can find
|
262
|
-
all texts by a query that consists of one character in Groonga. Because
|
263
|
-
Groonga finds tokens that start with the query by predictive search. For
|
264
|
-
example, Groonga can find <code class="docutils literal"><span class="pre">ll</span></code> and <code class="docutils literal"><span class="pre">lo</span></code> tokens by <code class="docutils literal"><span class="pre">l</span></code> query.</p>
|
265
|
-
<p>Bigram tokenize method isn't good for precision because you can find
|
266
|
-
texts that include the query inside a word. For example, you can find <code class="docutils literal"><span class="pre">world</span></code>
|
267
|
-
by <code class="docutils literal"><span class="pre">or</span></code>. This is more sensitive for ASCII only languages rather than
|
268
|
-
non-ASCII languages. <code class="docutils literal"><span class="pre">TokenBigram</span></code> has solution for this problem
|
269
|
-
described in the below.</p>
|
270
|
-
<p><code class="docutils literal"><span class="pre">TokenBigram</span></code> behavior is different when it's used with any
|
271
|
-
<a class="reference internal" href="normalizers.html"><span class="doc">Normalizers</span></a>.</p>
|
272
|
-
<p>If no normalizer is used, <code class="docutils literal"><span class="pre">TokenBigram</span></code> uses pure bigram (all tokens
|
273
|
-
except the last token have two characters) tokenize method:</p>
|
274
|
-
<p>Execution example:</p>
|
275
|
-
<div class="highlight-none"><div class="highlight"><pre><span></span>tokenize TokenBigram "Hello World"
|
276
|
-
# [
|
277
|
-
# [
|
278
|
-
# 0,
|
279
|
-
# 1337566253.89858,
|
280
|
-
# 0.000355720520019531
|
281
|
-
# ],
|
282
|
-
# [
|
283
|
-
# {
|
284
|
-
# "position": 0,
|
285
|
-
# "force_prefix": false,
|
286
|
-
# "value": "He"
|
287
|
-
# },
|
288
|
-
# {
|
289
|
-
# "position": 1,
|
290
|
-
# "force_prefix": false,
|
291
|
-
# "value": "el"
|
292
|
-
# },
|
293
|
-
# {
|
294
|
-
# "position": 2,
|
295
|
-
# "force_prefix": false,
|
296
|
-
# "value": "ll"
|
297
|
-
# },
|
298
|
-
# {
|
299
|
-
# "position": 3,
|
300
|
-
# "force_prefix": false,
|
301
|
-
# "value": "lo"
|
302
|
-
# },
|
303
|
-
# {
|
304
|
-
# "position": 4,
|
305
|
-
# "force_prefix": false,
|
306
|
-
# "value": "o "
|
307
|
-
# },
|
308
|
-
# {
|
309
|
-
# "position": 5,
|
310
|
-
# "force_prefix": false,
|
311
|
-
# "value": " W"
|
312
|
-
# },
|
313
|
-
# {
|
314
|
-
# "position": 6,
|
315
|
-
# "force_prefix": false,
|
316
|
-
# "value": "Wo"
|
317
|
-
# },
|
318
|
-
# {
|
319
|
-
# "position": 7,
|
320
|
-
# "force_prefix": false,
|
321
|
-
# "value": "or"
|
322
|
-
# },
|
323
|
-
# {
|
324
|
-
# "position": 8,
|
325
|
-
# "force_prefix": false,
|
326
|
-
# "value": "rl"
|
327
|
-
# },
|
328
|
-
# {
|
329
|
-
# "position": 9,
|
330
|
-
# "force_prefix": false,
|
331
|
-
# "value": "ld"
|
332
|
-
# },
|
333
|
-
# {
|
334
|
-
# "position": 10,
|
335
|
-
# "force_prefix": false,
|
336
|
-
# "value": "d"
|
337
|
-
# }
|
338
|
-
# ]
|
339
|
-
# ]
|
340
|
-
</pre></div>
|
341
|
-
</div>
|
342
|
-
<p>If normalizer is used, <code class="docutils literal"><span class="pre">TokenBigram</span></code> uses white-space-separate like
|
343
|
-
tokenize method for ASCII characters. <code class="docutils literal"><span class="pre">TokenBigram</span></code> uses bigram
|
344
|
-
tokenize method for non-ASCII characters.</p>
|
345
|
-
<p>You may be confused with this combined behavior. But it's reasonable
|
346
|
-
for most use cases such as English text (only ASCII characters) and
|
347
|
-
Japanese text (ASCII and non-ASCII characters are mixed).</p>
|
348
|
-
<p>Most languages that consist of only ASCII characters use white-space for
|
349
|
-
word separator. White-space-separate tokenize method is suitable for
|
350
|
-
the case.</p>
|
351
|
-
<p>Languages that consist of non-ASCII characters don't use white-space for
|
352
|
-
word separator. Bigram tokenize method is suitable for the case.</p>
|
353
|
-
<p>Mixed tokenize method is suitable for mixed language case.</p>
|
354
|
-
<p>If you want to use bigram tokenize method for ASCII character, see
|
355
|
-
<code class="docutils literal"><span class="pre">TokenBigramSplitXXX</span></code> type tokenizers such as
|
356
|
-
<a class="reference internal" href="#token-bigram-split-symbol-alpha"><span class="std std-ref">TokenBigramSplitSymbolAlpha</span></a>.</p>
|
357
|
-
<p>Let's confirm <code class="docutils literal"><span class="pre">TokenBigram</span></code> behavior by example.</p>
|
358
|
-
<p><code class="docutils literal"><span class="pre">TokenBigram</span></code> uses one or more white-spaces as token delimiter for
|
359
|
-
ASCII characters:</p>
|
360
|
-
<p>Execution example:</p>
|
361
|
-
<div class="highlight-none"><div class="highlight"><pre><span></span>tokenize TokenBigram "Hello World" NormalizerAuto
|
362
|
-
# [
|
363
|
-
# [
|
364
|
-
# 0,
|
365
|
-
# 1337566253.89858,
|
366
|
-
# 0.000355720520019531
|
367
|
-
# ],
|
368
|
-
# [
|
369
|
-
# {
|
370
|
-
# "position": 0,
|
371
|
-
# "force_prefix": false,
|
372
|
-
# "value": "hello"
|
373
|
-
# },
|
374
|
-
# {
|
375
|
-
# "position": 1,
|
376
|
-
# "force_prefix": false,
|
377
|
-
# "value": "world"
|
378
|
-
# }
|
379
|
-
# ]
|
380
|
-
# ]
|
381
|
-
</pre></div>
|
382
|
-
</div>
|
383
|
-
<p><code class="docutils literal"><span class="pre">TokenBigram</span></code> uses character type change as token delimiter for
|
384
|
-
ASCII characters. Character type is one of the following:</p>
|
385
|
-
<blockquote>
|
386
|
-
<div><ul class="simple">
|
387
|
-
<li>Alphabet</li>
|
388
|
-
<li>Digit</li>
|
389
|
-
<li>Symbol (such as <code class="docutils literal"><span class="pre">(</span></code>, <code class="docutils literal"><span class="pre">)</span></code> and <code class="docutils literal"><span class="pre">!</span></code>)</li>
|
390
|
-
<li>Hiragana</li>
|
391
|
-
<li>Katakana</li>
|
392
|
-
<li>Kanji</li>
|
393
|
-
<li>Others</li>
|
394
|
-
</ul>
|
395
|
-
</div></blockquote>
|
396
|
-
<p>The following example shows two token delimiters:</p>
|
397
|
-
<blockquote>
|
398
|
-
<div><ul class="simple">
|
399
|
-
<li>at between <code class="docutils literal"><span class="pre">100</span></code> (digits) and <code class="docutils literal"><span class="pre">cents</span></code> (alphabets)</li>
|
400
|
-
<li>at between <code class="docutils literal"><span class="pre">cents</span></code> (alphabets) and <code class="docutils literal"><span class="pre">!!!</span></code> (symbols)</li>
|
401
|
-
</ul>
|
402
|
-
</div></blockquote>
|
403
|
-
<p>Execution example:</p>
|
404
|
-
<div class="highlight-none"><div class="highlight"><pre><span></span>tokenize TokenBigram "100cents!!!" NormalizerAuto
|
405
|
-
# [
|
406
|
-
# [
|
407
|
-
# 0,
|
408
|
-
# 1337566253.89858,
|
409
|
-
# 0.000355720520019531
|
410
|
-
# ],
|
411
|
-
# [
|
412
|
-
# {
|
413
|
-
# "position": 0,
|
414
|
-
# "force_prefix": false,
|
415
|
-
# "value": "100"
|
416
|
-
# },
|
417
|
-
# {
|
418
|
-
# "position": 1,
|
419
|
-
# "force_prefix": false,
|
420
|
-
# "value": "cents"
|
421
|
-
# },
|
422
|
-
# {
|
423
|
-
# "position": 2,
|
424
|
-
# "force_prefix": false,
|
425
|
-
# "value": "!!!"
|
426
|
-
# }
|
427
|
-
# ]
|
428
|
-
# ]
|
429
|
-
</pre></div>
|
430
|
-
</div>
|
431
|
-
<p>Here is an example that <code class="docutils literal"><span class="pre">TokenBigram</span></code> uses bigram tokenize method
|
432
|
-
for non-ASCII characters.</p>
|
433
|
-
<p>Execution example:</p>
|
434
|
-
<div class="highlight-none"><div class="highlight"><pre><span></span>tokenize TokenBigram "日本語の勉強" NormalizerAuto
|
435
|
-
# [
|
436
|
-
# [
|
437
|
-
# 0,
|
438
|
-
# 1337566253.89858,
|
439
|
-
# 0.000355720520019531
|
440
|
-
# ],
|
441
|
-
# [
|
442
|
-
# {
|
443
|
-
# "position": 0,
|
444
|
-
# "force_prefix": false,
|
445
|
-
# "value": "日本"
|
446
|
-
# },
|
447
|
-
# {
|
448
|
-
# "position": 1,
|
449
|
-
# "force_prefix": false,
|
450
|
-
# "value": "本語"
|
451
|
-
# },
|
452
|
-
# {
|
453
|
-
# "position": 2,
|
454
|
-
# "force_prefix": false,
|
455
|
-
# "value": "語の"
|
456
|
-
# },
|
457
|
-
# {
|
458
|
-
# "position": 3,
|
459
|
-
# "force_prefix": false,
|
460
|
-
# "value": "の勉"
|
461
|
-
# },
|
462
|
-
# {
|
463
|
-
# "position": 4,
|
464
|
-
# "force_prefix": false,
|
465
|
-
# "value": "勉強"
|
466
|
-
# },
|
467
|
-
# {
|
468
|
-
# "position": 5,
|
469
|
-
# "force_prefix": false,
|
470
|
-
# "value": "強"
|
471
|
-
# }
|
472
|
-
# ]
|
473
|
-
# ]
|
474
|
-
</pre></div>
|
475
|
-
</div>
|
476
|
-
</div>
|
477
|
-
<div class="section" id="tokenbigramsplitsymbol">
|
478
|
-
<span id="token-bigram-split-symbol"></span><h3>7.8.3.2. <code class="docutils literal"><span class="pre">TokenBigramSplitSymbol</span></code><a class="headerlink" href="#tokenbigramsplitsymbol" title="Permalink to this headline">¶</a></h3>
|
479
|
-
<p><code class="docutils literal"><span class="pre">TokenBigramSplitSymbol</span></code> is similar to <a class="reference internal" href="#token-bigram"><span class="std std-ref">TokenBigram</span></a>. The
|
480
|
-
difference between them is symbol handling. <code class="docutils literal"><span class="pre">TokenBigramSplitSymbol</span></code>
|
481
|
-
tokenizes symbols by bigram tokenize method:</p>
|
482
|
-
<p>Execution example:</p>
|
483
|
-
<div class="highlight-none"><div class="highlight"><pre><span></span>tokenize TokenBigramSplitSymbol "100cents!!!" NormalizerAuto
|
484
|
-
# [
|
485
|
-
# [
|
486
|
-
# 0,
|
487
|
-
# 1337566253.89858,
|
488
|
-
# 0.000355720520019531
|
489
|
-
# ],
|
490
|
-
# [
|
491
|
-
# {
|
492
|
-
# "position": 0,
|
493
|
-
# "force_prefix": false,
|
494
|
-
# "value": "100"
|
495
|
-
# },
|
496
|
-
# {
|
497
|
-
# "position": 1,
|
498
|
-
# "force_prefix": false,
|
499
|
-
# "value": "cents"
|
500
|
-
# },
|
501
|
-
# {
|
502
|
-
# "position": 2,
|
503
|
-
# "force_prefix": false,
|
504
|
-
# "value": "!!"
|
505
|
-
# },
|
506
|
-
# {
|
507
|
-
# "position": 3,
|
508
|
-
# "force_prefix": false,
|
509
|
-
# "value": "!!"
|
510
|
-
# },
|
511
|
-
# {
|
512
|
-
# "position": 4,
|
513
|
-
# "force_prefix": false,
|
514
|
-
# "value": "!"
|
515
|
-
# }
|
516
|
-
# ]
|
517
|
-
# ]
|
518
|
-
</pre></div>
|
519
|
-
</div>
|
520
|
-
</div>
|
521
|
-
<div class="section" id="tokenbigramsplitsymbolalpha">
|
522
|
-
<span id="token-bigram-split-symbol-alpha"></span><h3>7.8.3.3. <code class="docutils literal"><span class="pre">TokenBigramSplitSymbolAlpha</span></code><a class="headerlink" href="#tokenbigramsplitsymbolalpha" title="Permalink to this headline">¶</a></h3>
|
523
|
-
<p><code class="docutils literal"><span class="pre">TokenBigramSplitSymbolAlpha</span></code> is similar to <a class="reference internal" href="#token-bigram"><span class="std std-ref">TokenBigram</span></a>. The
|
524
|
-
difference between them is symbol and alphabet
|
525
|
-
handling. <code class="docutils literal"><span class="pre">TokenBigramSplitSymbolAlpha</span></code> tokenizes symbols and
|
526
|
-
alphabets by bigram tokenize method:</p>
|
527
|
-
<p>Execution example:</p>
|
528
|
-
<div class="highlight-none"><div class="highlight"><pre><span></span>tokenize TokenBigramSplitSymbolAlpha "100cents!!!" NormalizerAuto
|
529
|
-
# [
|
530
|
-
# [
|
531
|
-
# 0,
|
532
|
-
# 1337566253.89858,
|
533
|
-
# 0.000355720520019531
|
534
|
-
# ],
|
535
|
-
# [
|
536
|
-
# {
|
537
|
-
# "position": 0,
|
538
|
-
# "force_prefix": false,
|
539
|
-
# "value": "100"
|
540
|
-
# },
|
541
|
-
# {
|
542
|
-
# "position": 1,
|
543
|
-
# "force_prefix": false,
|
544
|
-
# "value": "ce"
|
545
|
-
# },
|
546
|
-
# {
|
547
|
-
# "position": 2,
|
548
|
-
# "force_prefix": false,
|
549
|
-
# "value": "en"
|
550
|
-
# },
|
551
|
-
# {
|
552
|
-
# "position": 3,
|
553
|
-
# "force_prefix": false,
|
554
|
-
# "value": "nt"
|
555
|
-
# },
|
556
|
-
# {
|
557
|
-
# "position": 4,
|
558
|
-
# "force_prefix": false,
|
559
|
-
# "value": "ts"
|
560
|
-
# },
|
561
|
-
# {
|
562
|
-
# "position": 5,
|
563
|
-
# "force_prefix": false,
|
564
|
-
# "value": "s!"
|
565
|
-
# },
|
566
|
-
# {
|
567
|
-
# "position": 6,
|
568
|
-
# "force_prefix": false,
|
569
|
-
# "value": "!!"
|
570
|
-
# },
|
571
|
-
# {
|
572
|
-
# "position": 7,
|
573
|
-
# "force_prefix": false,
|
574
|
-
# "value": "!!"
|
575
|
-
# },
|
576
|
-
# {
|
577
|
-
# "position": 8,
|
578
|
-
# "force_prefix": false,
|
579
|
-
# "value": "!"
|
580
|
-
# }
|
581
|
-
# ]
|
582
|
-
# ]
|
583
|
-
</pre></div>
|
584
|
-
</div>
|
585
|
-
</div>
|
586
|
-
<div class="section" id="tokenbigramsplitsymbolalphadigit">
|
587
|
-
<span id="token-bigram-split-symbol-alpha-digit"></span><h3>7.8.3.4. <code class="docutils literal"><span class="pre">TokenBigramSplitSymbolAlphaDigit</span></code><a class="headerlink" href="#tokenbigramsplitsymbolalphadigit" title="Permalink to this headline">¶</a></h3>
|
588
|
-
<p><code class="docutils literal"><span class="pre">TokenBigramSplitSymbolAlphaDigit</span></code> is similar to
|
589
|
-
<a class="reference internal" href="#token-bigram"><span class="std std-ref">TokenBigram</span></a>. The difference between them is symbol, alphabet
|
590
|
-
and digit handling. <code class="docutils literal"><span class="pre">TokenBigramSplitSymbolAlphaDigit</span></code> tokenizes
|
591
|
-
symbols, alphabets and digits by bigram tokenize method. It means that
|
592
|
-
all characters are tokenized by bigram tokenize method:</p>
|
593
|
-
<p>Execution example:</p>
|
594
|
-
<div class="highlight-none"><div class="highlight"><pre><span></span>tokenize TokenBigramSplitSymbolAlphaDigit "100cents!!!" NormalizerAuto
|
595
|
-
# [
|
596
|
-
# [
|
597
|
-
# 0,
|
598
|
-
# 1337566253.89858,
|
599
|
-
# 0.000355720520019531
|
600
|
-
# ],
|
601
|
-
# [
|
602
|
-
# {
|
603
|
-
# "position": 0,
|
604
|
-
# "force_prefix": false,
|
605
|
-
# "value": "10"
|
606
|
-
# },
|
607
|
-
# {
|
608
|
-
# "position": 1,
|
609
|
-
# "force_prefix": false,
|
610
|
-
# "value": "00"
|
611
|
-
# },
|
612
|
-
# {
|
613
|
-
# "position": 2,
|
614
|
-
# "force_prefix": false,
|
615
|
-
# "value": "0c"
|
616
|
-
# },
|
617
|
-
# {
|
618
|
-
# "position": 3,
|
619
|
-
# "force_prefix": false,
|
620
|
-
# "value": "ce"
|
621
|
-
# },
|
622
|
-
# {
|
623
|
-
# "position": 4,
|
624
|
-
# "force_prefix": false,
|
625
|
-
# "value": "en"
|
626
|
-
# },
|
627
|
-
# {
|
628
|
-
# "position": 5,
|
629
|
-
# "force_prefix": false,
|
630
|
-
# "value": "nt"
|
631
|
-
# },
|
632
|
-
# {
|
633
|
-
# "position": 6,
|
634
|
-
# "force_prefix": false,
|
635
|
-
# "value": "ts"
|
636
|
-
# },
|
637
|
-
# {
|
638
|
-
# "position": 7,
|
639
|
-
# "force_prefix": false,
|
640
|
-
# "value": "s!"
|
641
|
-
# },
|
642
|
-
# {
|
643
|
-
# "position": 8,
|
644
|
-
# "force_prefix": false,
|
645
|
-
# "value": "!!"
|
646
|
-
# },
|
647
|
-
# {
|
648
|
-
# "position": 9,
|
649
|
-
# "force_prefix": false,
|
650
|
-
# "value": "!!"
|
651
|
-
# },
|
652
|
-
# {
|
653
|
-
# "position": 10,
|
654
|
-
# "force_prefix": false,
|
655
|
-
# "value": "!"
|
656
|
-
# }
|
657
|
-
# ]
|
658
|
-
# ]
|
659
|
-
</pre></div>
|
660
|
-
</div>
|
661
|
-
</div>
|
662
|
-
<div class="section" id="tokenbigramignoreblank">
|
663
|
-
<span id="token-bigram-ignore-blank"></span><h3>7.8.3.5. <code class="docutils literal"><span class="pre">TokenBigramIgnoreBlank</span></code><a class="headerlink" href="#tokenbigramignoreblank" title="Permalink to this headline">¶</a></h3>
|
664
|
-
<p><code class="docutils literal"><span class="pre">TokenBigramIgnoreBlank</span></code> is similar to <a class="reference internal" href="#token-bigram"><span class="std std-ref">TokenBigram</span></a>. The
|
665
|
-
difference between them is blank handling. <code class="docutils literal"><span class="pre">TokenBigramIgnoreBlank</span></code>
|
666
|
-
ignores white-spaces in continuous symbols and non-ASCII characters.</p>
|
667
|
-
<p>You can find difference of them by <code class="docutils literal"><span class="pre">日</span> <span class="pre">本</span> <span class="pre">語</span> <span class="pre">!</span> <span class="pre">!</span> <span class="pre">!</span></code> text because it
|
668
|
-
has symbols and non-ASCII characters.</p>
|
669
|
-
<p>Here is a result by <a class="reference internal" href="#token-bigram"><span class="std std-ref">TokenBigram</span></a> :</p>
|
670
|
-
<p>Execution example:</p>
|
671
|
-
<div class="highlight-none"><div class="highlight"><pre><span></span>tokenize TokenBigram "日 本 語 ! ! !" NormalizerAuto
|
672
|
-
# [
|
673
|
-
# [
|
674
|
-
# 0,
|
675
|
-
# 1337566253.89858,
|
676
|
-
# 0.000355720520019531
|
677
|
-
# ],
|
678
|
-
# [
|
679
|
-
# {
|
680
|
-
# "position": 0,
|
681
|
-
# "force_prefix": false,
|
682
|
-
# "value": "日"
|
683
|
-
# },
|
684
|
-
# {
|
685
|
-
# "position": 1,
|
686
|
-
# "force_prefix": false,
|
687
|
-
# "value": "本"
|
688
|
-
# },
|
689
|
-
# {
|
690
|
-
# "position": 2,
|
691
|
-
# "force_prefix": false,
|
692
|
-
# "value": "語"
|
693
|
-
# },
|
694
|
-
# {
|
695
|
-
# "position": 3,
|
696
|
-
# "force_prefix": false,
|
697
|
-
# "value": "!"
|
698
|
-
# },
|
699
|
-
# {
|
700
|
-
# "position": 4,
|
701
|
-
# "force_prefix": false,
|
702
|
-
# "value": "!"
|
703
|
-
# },
|
704
|
-
# {
|
705
|
-
# "position": 5,
|
706
|
-
# "force_prefix": false,
|
707
|
-
# "value": "!"
|
708
|
-
# }
|
709
|
-
# ]
|
710
|
-
# ]
|
711
|
-
</pre></div>
|
712
|
-
</div>
|
713
|
-
<p>Here is a result by <code class="docutils literal"><span class="pre">TokenBigramIgnoreBlank</span></code>:</p>
|
714
|
-
<p>Execution example:</p>
|
715
|
-
<div class="highlight-none"><div class="highlight"><pre><span></span>tokenize TokenBigramIgnoreBlank "日 本 語 ! ! !" NormalizerAuto
|
716
|
-
# [
|
717
|
-
# [
|
718
|
-
# 0,
|
719
|
-
# 1337566253.89858,
|
720
|
-
# 0.000355720520019531
|
721
|
-
# ],
|
722
|
-
# [
|
723
|
-
# {
|
724
|
-
# "position": 0,
|
725
|
-
# "force_prefix": false,
|
726
|
-
# "value": "日本"
|
727
|
-
# },
|
728
|
-
# {
|
729
|
-
# "position": 1,
|
730
|
-
# "force_prefix": false,
|
731
|
-
# "value": "本語"
|
732
|
-
# },
|
733
|
-
# {
|
734
|
-
# "position": 2,
|
735
|
-
# "force_prefix": false,
|
736
|
-
# "value": "語"
|
737
|
-
# },
|
738
|
-
# {
|
739
|
-
# "position": 3,
|
740
|
-
# "force_prefix": false,
|
741
|
-
# "value": "!!!"
|
742
|
-
# }
|
743
|
-
# ]
|
744
|
-
# ]
|
745
|
-
</pre></div>
|
746
|
-
</div>
|
747
|
-
</div>
|
748
|
-
<div class="section" id="tokenbigramignoreblanksplitsymbol">
|
749
|
-
<span id="token-bigram-ignore-blank-split-symbol"></span><h3>7.8.3.6. <code class="docutils literal"><span class="pre">TokenBigramIgnoreBlankSplitSymbol</span></code><a class="headerlink" href="#tokenbigramignoreblanksplitsymbol" title="Permalink to this headline">¶</a></h3>
|
750
|
-
<p><code class="docutils literal"><span class="pre">TokenBigramIgnoreBlankSplitSymbol</span></code> is similar to
|
751
|
-
<a class="reference internal" href="#token-bigram"><span class="std std-ref">TokenBigram</span></a>. The differences between them are the following:</p>
|
752
|
-
<blockquote>
|
753
|
-
<div><ul class="simple">
|
754
|
-
<li>Blank handling</li>
|
755
|
-
<li>Symbol handling</li>
|
756
|
-
</ul>
|
757
|
-
</div></blockquote>
|
758
|
-
<p><code class="docutils literal"><span class="pre">TokenBigramIgnoreBlankSplitSymbol</span></code> ignores white-spaces in
|
759
|
-
continuous symbols and non-ASCII characters.</p>
|
760
|
-
<p><code class="docutils literal"><span class="pre">TokenBigramIgnoreBlankSplitSymbol</span></code> tokenizes symbols by bigram
|
761
|
-
tokenize method.</p>
|
762
|
-
<p>You can find difference of them by <code class="docutils literal"><span class="pre">日</span> <span class="pre">本</span> <span class="pre">語</span> <span class="pre">!</span> <span class="pre">!</span> <span class="pre">!</span></code> text because it
|
763
|
-
has symbols and non-ASCII characters.</p>
|
764
|
-
<p>Here is a result by <a class="reference internal" href="#token-bigram"><span class="std std-ref">TokenBigram</span></a> :</p>
|
765
|
-
<p>Execution example:</p>
|
766
|
-
<div class="highlight-none"><div class="highlight"><pre><span></span>tokenize TokenBigram "日 本 語 ! ! !" NormalizerAuto
|
767
|
-
# [
|
768
|
-
# [
|
769
|
-
# 0,
|
770
|
-
# 1337566253.89858,
|
771
|
-
# 0.000355720520019531
|
772
|
-
# ],
|
773
|
-
# [
|
774
|
-
# {
|
775
|
-
# "position": 0,
|
776
|
-
# "force_prefix": false,
|
777
|
-
# "value": "日"
|
778
|
-
# },
|
779
|
-
# {
|
780
|
-
# "position": 1,
|
781
|
-
# "force_prefix": false,
|
782
|
-
# "value": "本"
|
783
|
-
# },
|
784
|
-
# {
|
785
|
-
# "position": 2,
|
786
|
-
# "force_prefix": false,
|
787
|
-
# "value": "語"
|
788
|
-
# },
|
789
|
-
# {
|
790
|
-
# "position": 3,
|
791
|
-
# "force_prefix": false,
|
792
|
-
# "value": "!"
|
793
|
-
# },
|
794
|
-
# {
|
795
|
-
# "position": 4,
|
796
|
-
# "force_prefix": false,
|
797
|
-
# "value": "!"
|
798
|
-
# },
|
799
|
-
# {
|
800
|
-
# "position": 5,
|
801
|
-
# "force_prefix": false,
|
802
|
-
# "value": "!"
|
803
|
-
# }
|
804
|
-
# ]
|
805
|
-
# ]
|
806
|
-
</pre></div>
|
807
|
-
</div>
|
808
|
-
<p>Here is a result by <code class="docutils literal"><span class="pre">TokenBigramIgnoreBlankSplitSymbol</span></code>:</p>
|
809
|
-
<p>Execution example:</p>
|
810
|
-
<div class="highlight-none"><div class="highlight"><pre><span></span>tokenize TokenBigramIgnoreBlankSplitSymbol "日 本 語 ! ! !" NormalizerAuto
|
811
|
-
# [
|
812
|
-
# [
|
813
|
-
# 0,
|
814
|
-
# 1337566253.89858,
|
815
|
-
# 0.000355720520019531
|
816
|
-
# ],
|
817
|
-
# [
|
818
|
-
# {
|
819
|
-
# "position": 0,
|
820
|
-
# "force_prefix": false,
|
821
|
-
# "value": "日本"
|
822
|
-
# },
|
823
|
-
# {
|
824
|
-
# "position": 1,
|
825
|
-
# "force_prefix": false,
|
826
|
-
# "value": "本語"
|
827
|
-
# },
|
828
|
-
# {
|
829
|
-
# "position": 2,
|
830
|
-
# "force_prefix": false,
|
831
|
-
# "value": "語!"
|
832
|
-
# },
|
833
|
-
# {
|
834
|
-
# "position": 3,
|
835
|
-
# "force_prefix": false,
|
836
|
-
# "value": "!!"
|
837
|
-
# },
|
838
|
-
# {
|
839
|
-
# "position": 4,
|
840
|
-
# "force_prefix": false,
|
841
|
-
# "value": "!!"
|
842
|
-
# },
|
843
|
-
# {
|
844
|
-
# "position": 5,
|
845
|
-
# "force_prefix": false,
|
846
|
-
# "value": "!"
|
847
|
-
# }
|
848
|
-
# ]
|
849
|
-
# ]
|
850
|
-
</pre></div>
|
851
|
-
</div>
|
852
|
-
</div>
|
853
|
-
<div class="section" id="tokenbigramignoreblanksplitsymbolalpha">
|
854
|
-
<span id="token-bigram-ignore-blank-split-symbol-alpha"></span><h3>7.8.3.7. <code class="docutils literal"><span class="pre">TokenBigramIgnoreBlankSplitSymbolAlpha</span></code><a class="headerlink" href="#tokenbigramignoreblanksplitsymbolalpha" title="Permalink to this headline">¶</a></h3>
|
855
|
-
<p><code class="docutils literal"><span class="pre">TokenBigramIgnoreBlankSplitSymbolAlpha</span></code> is similar to
|
856
|
-
<a class="reference internal" href="#token-bigram"><span class="std std-ref">TokenBigram</span></a>. The differences between them are the following:</p>
|
857
|
-
<blockquote>
|
858
|
-
<div><ul class="simple">
|
859
|
-
<li>Blank handling</li>
|
860
|
-
<li>Symbol and alphabet handling</li>
|
861
|
-
</ul>
|
862
|
-
</div></blockquote>
|
863
|
-
<p><code class="docutils literal"><span class="pre">TokenBigramIgnoreBlankSplitSymbolAlpha</span></code> ignores white-spaces in
|
864
|
-
continuous symbols and non-ASCII characters.</p>
|
865
|
-
<p><code class="docutils literal"><span class="pre">TokenBigramIgnoreBlankSplitSymbolAlpha</span></code> tokenizes symbols and
|
866
|
-
alphabets by bigram tokenize method.</p>
|
867
|
-
<p>You can find difference of them by <code class="docutils literal"><span class="pre">Hello</span> <span class="pre">日</span> <span class="pre">本</span> <span class="pre">語</span> <span class="pre">!</span> <span class="pre">!</span> <span class="pre">!</span></code> text because it
|
868
|
-
has symbols and non-ASCII characters with white spaces and alphabets.</p>
|
869
|
-
<p>Here is a result by <a class="reference internal" href="#token-bigram"><span class="std std-ref">TokenBigram</span></a> :</p>
|
870
|
-
<p>Execution example:</p>
|
871
|
-
<div class="highlight-none"><div class="highlight"><pre><span></span>tokenize TokenBigram "Hello 日 本 語 ! ! !" NormalizerAuto
|
872
|
-
# [
|
873
|
-
# [
|
874
|
-
# 0,
|
875
|
-
# 1337566253.89858,
|
876
|
-
# 0.000355720520019531
|
877
|
-
# ],
|
878
|
-
# [
|
879
|
-
# {
|
880
|
-
# "position": 0,
|
881
|
-
# "force_prefix": false,
|
882
|
-
# "value": "hello"
|
883
|
-
# },
|
884
|
-
# {
|
885
|
-
# "position": 1,
|
886
|
-
# "force_prefix": false,
|
887
|
-
# "value": "日"
|
888
|
-
# },
|
889
|
-
# {
|
890
|
-
# "position": 2,
|
891
|
-
# "force_prefix": false,
|
892
|
-
# "value": "本"
|
893
|
-
# },
|
894
|
-
# {
|
895
|
-
# "position": 3,
|
896
|
-
# "force_prefix": false,
|
897
|
-
# "value": "語"
|
898
|
-
# },
|
899
|
-
# {
|
900
|
-
# "position": 4,
|
901
|
-
# "force_prefix": false,
|
902
|
-
# "value": "!"
|
903
|
-
# },
|
904
|
-
# {
|
905
|
-
# "position": 5,
|
906
|
-
# "force_prefix": false,
|
907
|
-
# "value": "!"
|
908
|
-
# },
|
909
|
-
# {
|
910
|
-
# "position": 6,
|
911
|
-
# "force_prefix": false,
|
912
|
-
# "value": "!"
|
913
|
-
# }
|
914
|
-
# ]
|
915
|
-
# ]
|
916
|
-
</pre></div>
|
917
|
-
</div>
|
918
|
-
<p>Here is a result by <code class="docutils literal"><span class="pre">TokenBigramIgnoreBlankSplitSymbolAlpha</span></code>:</p>
|
919
|
-
<p>Execution example:</p>
|
920
|
-
<div class="highlight-none"><div class="highlight"><pre><span></span>tokenize TokenBigramIgnoreBlankSplitSymbolAlpha "Hello 日 本 語 ! ! !" NormalizerAuto
|
921
|
-
# [
|
922
|
-
# [
|
923
|
-
# 0,
|
924
|
-
# 1337566253.89858,
|
925
|
-
# 0.000355720520019531
|
926
|
-
# ],
|
927
|
-
# [
|
928
|
-
# {
|
929
|
-
# "position": 0,
|
930
|
-
# "force_prefix": false,
|
931
|
-
# "value": "he"
|
932
|
-
# },
|
933
|
-
# {
|
934
|
-
# "position": 1,
|
935
|
-
# "force_prefix": false,
|
936
|
-
# "value": "el"
|
937
|
-
# },
|
938
|
-
# {
|
939
|
-
# "position": 2,
|
940
|
-
# "force_prefix": false,
|
941
|
-
# "value": "ll"
|
942
|
-
# },
|
943
|
-
# {
|
944
|
-
# "position": 3,
|
945
|
-
# "force_prefix": false,
|
946
|
-
# "value": "lo"
|
947
|
-
# },
|
948
|
-
# {
|
949
|
-
# "position": 4,
|
950
|
-
# "force_prefix": false,
|
951
|
-
# "value": "o日"
|
952
|
-
# },
|
953
|
-
# {
|
954
|
-
# "position": 5,
|
955
|
-
# "force_prefix": false,
|
956
|
-
# "value": "日本"
|
957
|
-
# },
|
958
|
-
# {
|
959
|
-
# "position": 6,
|
960
|
-
# "force_prefix": false,
|
961
|
-
# "value": "本語"
|
962
|
-
# },
|
963
|
-
# {
|
964
|
-
# "position": 7,
|
965
|
-
# "force_prefix": false,
|
966
|
-
# "value": "語!"
|
967
|
-
# },
|
968
|
-
# {
|
969
|
-
# "position": 8,
|
970
|
-
# "force_prefix": false,
|
971
|
-
# "value": "!!"
|
972
|
-
# },
|
973
|
-
# {
|
974
|
-
# "position": 9,
|
975
|
-
# "force_prefix": false,
|
976
|
-
# "value": "!!"
|
977
|
-
# },
|
978
|
-
# {
|
979
|
-
# "position": 10,
|
980
|
-
# "force_prefix": false,
|
981
|
-
# "value": "!"
|
982
|
-
# }
|
983
|
-
# ]
|
984
|
-
# ]
|
985
|
-
</pre></div>
|
986
|
-
</div>
|
987
|
-
</div>
|
988
|
-
<div class="section" id="tokenbigramignoreblanksplitsymbolalphadigit">
|
989
|
-
<span id="token-bigram-ignore-blank-split-symbol-alpha-digit"></span><h3>7.8.3.8. <code class="docutils literal"><span class="pre">TokenBigramIgnoreBlankSplitSymbolAlphaDigit</span></code><a class="headerlink" href="#tokenbigramignoreblanksplitsymbolalphadigit" title="Permalink to this headline">¶</a></h3>
|
990
|
-
<p><code class="docutils literal"><span class="pre">TokenBigramIgnoreBlankSplitSymbolAlphaDigit</span></code> is similar to
|
991
|
-
<a class="reference internal" href="#token-bigram"><span class="std std-ref">TokenBigram</span></a>. The differences between them are the following:</p>
|
992
|
-
<blockquote>
|
993
|
-
<div><ul class="simple">
|
994
|
-
<li>Blank handling</li>
|
995
|
-
<li>Symbol, alphabet and digit handling</li>
|
65
|
+
<div class="toctree-wrapper compound">
|
66
|
+
<ul>
|
67
|
+
<li class="toctree-l1"><a class="reference internal" href="tokenizer/summary.html">7.8.1. Summary</a></li>
|
68
|
+
<li class="toctree-l1"><a class="reference internal" href="tokenizers/token_bigram.html">7.8.2. <code class="docutils literal notranslate"><span class="pre">TokenBigram</span></code></a></li>
|
69
|
+
<li class="toctree-l1"><a class="reference internal" href="tokenizers/token_bigram_ignore_blank.html">7.8.3. <code class="docutils literal notranslate"><span class="pre">TokenBigramIgnoreBlank</span></code></a></li>
|
70
|
+
<li class="toctree-l1"><a class="reference internal" href="tokenizers/token_bigram_ignore_blank_split_symbol.html">7.8.4. <code class="docutils literal notranslate"><span class="pre">TokenBigramIgnoreBlankSplitSymbol</span></code></a></li>
|
71
|
+
<li class="toctree-l1"><a class="reference internal" href="tokenizers/token_bigram_ignore_blank_split_symbol_alpha.html">7.8.5. <code class="docutils literal notranslate"><span class="pre">TokenBigramIgnoreBlankSplitSymbolAlpha</span></code></a></li>
|
72
|
+
<li class="toctree-l1"><a class="reference internal" href="tokenizers/token_bigram_ignore_blank_split_symbol_alpha_digit.html">7.8.6. <code class="docutils literal notranslate"><span class="pre">TokenBigramIgnoreBlankSplitSymbolAlphaDigit</span></code></a></li>
|
73
|
+
<li class="toctree-l1"><a class="reference internal" href="tokenizers/token_bigram_split_symbol.html">7.8.7. <code class="docutils literal notranslate"><span class="pre">TokenBigramSplitSymbol</span></code></a></li>
|
74
|
+
<li class="toctree-l1"><a class="reference internal" href="tokenizers/token_bigram_split_symbol_alpha.html">7.8.8. <code class="docutils literal notranslate"><span class="pre">TokenBigramSplitSymbolAlpha</span></code></a></li>
|
75
|
+
<li class="toctree-l1"><a class="reference internal" href="tokenizers/token_bigram_split_symbol_alpha_digit.html">7.8.9. <code class="docutils literal notranslate"><span class="pre">TokenBigramSplitSymbolAlphaDigit</span></code></a></li>
|
76
|
+
<li class="toctree-l1"><a class="reference internal" href="tokenizers/token_delimit.html">7.8.10. <code class="docutils literal notranslate"><span class="pre">TokenDelimit</span></code></a></li>
|
77
|
+
<li class="toctree-l1"><a class="reference internal" href="tokenizers/token_delimit_null.html">7.8.11. <code class="docutils literal notranslate"><span class="pre">TokenDelimitNull</span></code></a></li>
|
78
|
+
<li class="toctree-l1"><a class="reference internal" href="tokenizers/token_mecab.html">7.8.12. <code class="docutils literal notranslate"><span class="pre">TokenMecab</span></code></a></li>
|
79
|
+
<li class="toctree-l1"><a class="reference internal" href="tokenizers/token_regexp.html">7.8.13. <code class="docutils literal notranslate"><span class="pre">TokenRegexp</span></code></a></li>
|
80
|
+
<li class="toctree-l1"><a class="reference internal" href="tokenizers/token_trigram.html">7.8.14. <code class="docutils literal notranslate"><span class="pre">TokenTrigram</span></code></a></li>
|
81
|
+
<li class="toctree-l1"><a class="reference internal" href="tokenizers/token_unigram.html">7.8.15. <code class="docutils literal notranslate"><span class="pre">TokenUnigram</span></code></a></li>
|
996
82
|
</ul>
|
997
|
-
</div></blockquote>
|
998
|
-
<p><code class="docutils literal"><span class="pre">TokenBigramIgnoreBlankSplitSymbolAlphaDigit</span></code> ignores white-spaces
|
999
|
-
in continuous symbols and non-ASCII characters.</p>
|
1000
|
-
<p><code class="docutils literal"><span class="pre">TokenBigramIgnoreBlankSplitSymbolAlphaDigit</span></code> tokenizes symbols,
|
1001
|
-
alphabets and digits by bigram tokenize method. It means that all
|
1002
|
-
characters are tokenized by bigram tokenize method.</p>
|
1003
|
-
<p>You can find difference of them by <code class="docutils literal"><span class="pre">Hello</span> <span class="pre">日</span> <span class="pre">本</span> <span class="pre">語</span> <span class="pre">!</span> <span class="pre">!</span> <span class="pre">!</span> <span class="pre">777</span></code> text
|
1004
|
-
because it has symbols and non-ASCII characters with white spaces,
|
1005
|
-
alphabets and digits.</p>
|
1006
|
-
<p>Here is a result by <a class="reference internal" href="#token-bigram"><span class="std std-ref">TokenBigram</span></a> :</p>
|
1007
|
-
<p>Execution example:</p>
|
1008
|
-
<div class="highlight-none"><div class="highlight"><pre><span></span>tokenize TokenBigram "Hello 日 本 語 ! ! ! 777" NormalizerAuto
|
1009
|
-
# [
|
1010
|
-
# [
|
1011
|
-
# 0,
|
1012
|
-
# 1337566253.89858,
|
1013
|
-
# 0.000355720520019531
|
1014
|
-
# ],
|
1015
|
-
# [
|
1016
|
-
# {
|
1017
|
-
# "position": 0,
|
1018
|
-
# "force_prefix": false,
|
1019
|
-
# "value": "hello"
|
1020
|
-
# },
|
1021
|
-
# {
|
1022
|
-
# "position": 1,
|
1023
|
-
# "force_prefix": false,
|
1024
|
-
# "value": "日"
|
1025
|
-
# },
|
1026
|
-
# {
|
1027
|
-
# "position": 2,
|
1028
|
-
# "force_prefix": false,
|
1029
|
-
# "value": "本"
|
1030
|
-
# },
|
1031
|
-
# {
|
1032
|
-
# "position": 3,
|
1033
|
-
# "force_prefix": false,
|
1034
|
-
# "value": "語"
|
1035
|
-
# },
|
1036
|
-
# {
|
1037
|
-
# "position": 4,
|
1038
|
-
# "force_prefix": false,
|
1039
|
-
# "value": "!"
|
1040
|
-
# },
|
1041
|
-
# {
|
1042
|
-
# "position": 5,
|
1043
|
-
# "force_prefix": false,
|
1044
|
-
# "value": "!"
|
1045
|
-
# },
|
1046
|
-
# {
|
1047
|
-
# "position": 6,
|
1048
|
-
# "force_prefix": false,
|
1049
|
-
# "value": "!"
|
1050
|
-
# },
|
1051
|
-
# {
|
1052
|
-
# "position": 7,
|
1053
|
-
# "force_prefix": false,
|
1054
|
-
# "value": "777"
|
1055
|
-
# }
|
1056
|
-
# ]
|
1057
|
-
# ]
|
1058
|
-
</pre></div>
|
1059
|
-
</div>
|
1060
|
-
<p>Here is a result by <code class="docutils literal"><span class="pre">TokenBigramIgnoreBlankSplitSymbolAlphaDigit</span></code>:</p>
|
1061
|
-
<p>Execution example:</p>
|
1062
|
-
<div class="highlight-none"><div class="highlight"><pre><span></span>tokenize TokenBigramIgnoreBlankSplitSymbolAlphaDigit "Hello 日 本 語 ! ! ! 777" NormalizerAuto
|
1063
|
-
# [
|
1064
|
-
# [
|
1065
|
-
# 0,
|
1066
|
-
# 1337566253.89858,
|
1067
|
-
# 0.000355720520019531
|
1068
|
-
# ],
|
1069
|
-
# [
|
1070
|
-
# {
|
1071
|
-
# "position": 0,
|
1072
|
-
# "force_prefix": false,
|
1073
|
-
# "value": "he"
|
1074
|
-
# },
|
1075
|
-
# {
|
1076
|
-
# "position": 1,
|
1077
|
-
# "force_prefix": false,
|
1078
|
-
# "value": "el"
|
1079
|
-
# },
|
1080
|
-
# {
|
1081
|
-
# "position": 2,
|
1082
|
-
# "force_prefix": false,
|
1083
|
-
# "value": "ll"
|
1084
|
-
# },
|
1085
|
-
# {
|
1086
|
-
# "position": 3,
|
1087
|
-
# "force_prefix": false,
|
1088
|
-
# "value": "lo"
|
1089
|
-
# },
|
1090
|
-
# {
|
1091
|
-
# "position": 4,
|
1092
|
-
# "force_prefix": false,
|
1093
|
-
# "value": "o日"
|
1094
|
-
# },
|
1095
|
-
# {
|
1096
|
-
# "position": 5,
|
1097
|
-
# "force_prefix": false,
|
1098
|
-
# "value": "日本"
|
1099
|
-
# },
|
1100
|
-
# {
|
1101
|
-
# "position": 6,
|
1102
|
-
# "force_prefix": false,
|
1103
|
-
# "value": "本語"
|
1104
|
-
# },
|
1105
|
-
# {
|
1106
|
-
# "position": 7,
|
1107
|
-
# "force_prefix": false,
|
1108
|
-
# "value": "語!"
|
1109
|
-
# },
|
1110
|
-
# {
|
1111
|
-
# "position": 8,
|
1112
|
-
# "force_prefix": false,
|
1113
|
-
# "value": "!!"
|
1114
|
-
# },
|
1115
|
-
# {
|
1116
|
-
# "position": 9,
|
1117
|
-
# "force_prefix": false,
|
1118
|
-
# "value": "!!"
|
1119
|
-
# },
|
1120
|
-
# {
|
1121
|
-
# "position": 10,
|
1122
|
-
# "force_prefix": false,
|
1123
|
-
# "value": "!7"
|
1124
|
-
# },
|
1125
|
-
# {
|
1126
|
-
# "position": 11,
|
1127
|
-
# "force_prefix": false,
|
1128
|
-
# "value": "77"
|
1129
|
-
# },
|
1130
|
-
# {
|
1131
|
-
# "position": 12,
|
1132
|
-
# "force_prefix": false,
|
1133
|
-
# "value": "77"
|
1134
|
-
# },
|
1135
|
-
# {
|
1136
|
-
# "position": 13,
|
1137
|
-
# "force_prefix": false,
|
1138
|
-
# "value": "7"
|
1139
|
-
# }
|
1140
|
-
# ]
|
1141
|
-
# ]
|
1142
|
-
</pre></div>
|
1143
|
-
</div>
|
1144
|
-
</div>
|
1145
|
-
<div class="section" id="tokenunigram">
|
1146
|
-
<span id="token-unigram"></span><h3>7.8.3.9. <code class="docutils literal"><span class="pre">TokenUnigram</span></code><a class="headerlink" href="#tokenunigram" title="Permalink to this headline">¶</a></h3>
|
1147
|
-
<p><code class="docutils literal"><span class="pre">TokenUnigram</span></code> is similar to <a class="reference internal" href="#token-bigram"><span class="std std-ref">TokenBigram</span></a>. The difference
|
1148
|
-
between them is token unit. <a class="reference internal" href="#token-bigram"><span class="std std-ref">TokenBigram</span></a> uses 2 characters per
|
1149
|
-
token. <code class="docutils literal"><span class="pre">TokenUnigram</span></code> uses 1 character per token.</p>
|
1150
|
-
<p>Execution example:</p>
|
1151
|
-
<div class="highlight-none"><div class="highlight"><pre><span></span>tokenize TokenUnigram "100cents!!!" NormalizerAuto
|
1152
|
-
# [
|
1153
|
-
# [
|
1154
|
-
# 0,
|
1155
|
-
# 1337566253.89858,
|
1156
|
-
# 0.000355720520019531
|
1157
|
-
# ],
|
1158
|
-
# [
|
1159
|
-
# {
|
1160
|
-
# "position": 0,
|
1161
|
-
# "force_prefix": false,
|
1162
|
-
# "value": "100"
|
1163
|
-
# },
|
1164
|
-
# {
|
1165
|
-
# "position": 1,
|
1166
|
-
# "force_prefix": false,
|
1167
|
-
# "value": "cents"
|
1168
|
-
# },
|
1169
|
-
# {
|
1170
|
-
# "position": 2,
|
1171
|
-
# "force_prefix": false,
|
1172
|
-
# "value": "!!!"
|
1173
|
-
# }
|
1174
|
-
# ]
|
1175
|
-
# ]
|
1176
|
-
</pre></div>
|
1177
|
-
</div>
|
1178
|
-
</div>
|
1179
|
-
<div class="section" id="tokentrigram">
|
1180
|
-
<span id="token-trigram"></span><h3>7.8.3.10. <code class="docutils literal"><span class="pre">TokenTrigram</span></code><a class="headerlink" href="#tokentrigram" title="Permalink to this headline">¶</a></h3>
|
1181
|
-
<p><code class="docutils literal"><span class="pre">TokenTrigram</span></code> is similar to <a class="reference internal" href="#token-bigram"><span class="std std-ref">TokenBigram</span></a>. The difference
|
1182
|
-
between them is token unit. <a class="reference internal" href="#token-bigram"><span class="std std-ref">TokenBigram</span></a> uses 2 characters per
|
1183
|
-
token. <code class="docutils literal"><span class="pre">TokenTrigram</span></code> uses 3 characters per token.</p>
|
1184
|
-
<p>Execution example:</p>
|
1185
|
-
<div class="highlight-none"><div class="highlight"><pre><span></span>tokenize TokenTrigram "10000cents!!!!!" NormalizerAuto
|
1186
|
-
# [
|
1187
|
-
# [
|
1188
|
-
# 0,
|
1189
|
-
# 1337566253.89858,
|
1190
|
-
# 0.000355720520019531
|
1191
|
-
# ],
|
1192
|
-
# [
|
1193
|
-
# {
|
1194
|
-
# "position": 0,
|
1195
|
-
# "force_prefix": false,
|
1196
|
-
# "value": "10000"
|
1197
|
-
# },
|
1198
|
-
# {
|
1199
|
-
# "position": 1,
|
1200
|
-
# "force_prefix": false,
|
1201
|
-
# "value": "cents"
|
1202
|
-
# },
|
1203
|
-
# {
|
1204
|
-
# "position": 2,
|
1205
|
-
# "force_prefix": false,
|
1206
|
-
# "value": "!!!!!"
|
1207
|
-
# }
|
1208
|
-
# ]
|
1209
|
-
# ]
|
1210
|
-
</pre></div>
|
1211
|
-
</div>
|
1212
|
-
</div>
|
1213
|
-
<div class="section" id="tokendelimit">
|
1214
|
-
<span id="token-delimit"></span><h3>7.8.3.11. <code class="docutils literal"><span class="pre">TokenDelimit</span></code><a class="headerlink" href="#tokendelimit" title="Permalink to this headline">¶</a></h3>
|
1215
|
-
<p><code class="docutils literal"><span class="pre">TokenDelimit</span></code> extracts token by splitting one or more space
|
1216
|
-
characters (<code class="docutils literal"><span class="pre">U+0020</span></code>). For example, <code class="docutils literal"><span class="pre">Hello</span> <span class="pre">World</span></code> is tokenized to
|
1217
|
-
<code class="docutils literal"><span class="pre">Hello</span></code> and <code class="docutils literal"><span class="pre">World</span></code>.</p>
|
1218
|
-
<p><code class="docutils literal"><span class="pre">TokenDelimit</span></code> is suitable for tag text. You can extract <code class="docutils literal"><span class="pre">groonga</span></code>
|
1219
|
-
and <code class="docutils literal"><span class="pre">full-text-search</span></code> and <code class="docutils literal"><span class="pre">http</span></code> as tags from <code class="docutils literal"><span class="pre">groonga</span>
|
1220
|
-
<span class="pre">full-text-search</span> <span class="pre">http</span></code>.</p>
|
1221
|
-
<p>Here is an example of <code class="docutils literal"><span class="pre">TokenDelimit</span></code>:</p>
|
1222
|
-
<p>Execution example:</p>
|
1223
|
-
<div class="highlight-none"><div class="highlight"><pre><span></span>tokenize TokenDelimit "Groonga full-text-search HTTP" NormalizerAuto
|
1224
|
-
# [
|
1225
|
-
# [
|
1226
|
-
# 0,
|
1227
|
-
# 1337566253.89858,
|
1228
|
-
# 0.000355720520019531
|
1229
|
-
# ],
|
1230
|
-
# [
|
1231
|
-
# {
|
1232
|
-
# "position": 0,
|
1233
|
-
# "force_prefix": false,
|
1234
|
-
# "value": "groonga"
|
1235
|
-
# },
|
1236
|
-
# {
|
1237
|
-
# "position": 1,
|
1238
|
-
# "force_prefix": false,
|
1239
|
-
# "value": "full-text-search"
|
1240
|
-
# },
|
1241
|
-
# {
|
1242
|
-
# "position": 2,
|
1243
|
-
# "force_prefix": false,
|
1244
|
-
# "value": "http"
|
1245
|
-
# }
|
1246
|
-
# ]
|
1247
|
-
# ]
|
1248
|
-
</pre></div>
|
1249
|
-
</div>
|
1250
|
-
</div>
|
1251
|
-
<div class="section" id="tokendelimitnull">
|
1252
|
-
<span id="token-delimit-null"></span><h3>7.8.3.12. <code class="docutils literal"><span class="pre">TokenDelimitNull</span></code><a class="headerlink" href="#tokendelimitnull" title="Permalink to this headline">¶</a></h3>
|
1253
|
-
<p><code class="docutils literal"><span class="pre">TokenDelimitNull</span></code> is similar to <a class="reference internal" href="#token-delimit"><span class="std std-ref">TokenDelimit</span></a>. The
|
1254
|
-
difference between them is separator character. <a class="reference internal" href="#token-delimit"><span class="std std-ref">TokenDelimit</span></a>
|
1255
|
-
uses space character (<code class="docutils literal"><span class="pre">U+0020</span></code>) but <code class="docutils literal"><span class="pre">TokenDelimitNull</span></code> uses NUL
|
1256
|
-
character (<code class="docutils literal"><span class="pre">U+0000</span></code>).</p>
|
1257
|
-
<p><code class="docutils literal"><span class="pre">TokenDelimitNull</span></code> is also suitable for tag text.</p>
|
1258
|
-
<p>Here is an example of <code class="docutils literal"><span class="pre">TokenDelimitNull</span></code>:</p>
|
1259
|
-
<p>Execution example:</p>
|
1260
|
-
<div class="highlight-none"><div class="highlight"><pre><span></span>tokenize TokenDelimitNull "Groonga\u0000full-text-search\u0000HTTP" NormalizerAuto
|
1261
|
-
# [
|
1262
|
-
# [
|
1263
|
-
# 0,
|
1264
|
-
# 1337566253.89858,
|
1265
|
-
# 0.000355720520019531
|
1266
|
-
# ],
|
1267
|
-
# [
|
1268
|
-
# {
|
1269
|
-
# "position": 0,
|
1270
|
-
# "force_prefix": false,
|
1271
|
-
# "value": "groongau0000full-text-searchu0000http"
|
1272
|
-
# }
|
1273
|
-
# ]
|
1274
|
-
# ]
|
1275
|
-
</pre></div>
|
1276
|
-
</div>
|
1277
|
-
</div>
|
1278
|
-
<div class="section" id="tokenmecab">
|
1279
|
-
<span id="token-mecab"></span><h3>7.8.3.13. <code class="docutils literal"><span class="pre">TokenMecab</span></code><a class="headerlink" href="#tokenmecab" title="Permalink to this headline">¶</a></h3>
|
1280
|
-
<p><code class="docutils literal"><span class="pre">TokenMecab</span></code> is a tokenizer based on <a class="reference external" href="https://taku910.github.io/mecab/">MeCab</a> part-of-speech and
|
1281
|
-
morphological analyzer.</p>
|
1282
|
-
<p>MeCab doesn't depend on Japanese. You can use MeCab for other
|
1283
|
-
languages by creating dictionary for the languages. You can use <a class="reference external" href="http://osdn.jp/projects/naist-jdic/">NAIST
|
1284
|
-
Japanese Dictionary</a>
|
1285
|
-
for Japanese.</p>
|
1286
|
-
<p><code class="docutils literal"><span class="pre">TokenMecab</span></code> is good for precision rather than recall. You can find
|
1287
|
-
<code class="docutils literal"><span class="pre">東京都</span></code> and <code class="docutils literal"><span class="pre">京都</span></code> texts by <code class="docutils literal"><span class="pre">京都</span></code> query with
|
1288
|
-
<a class="reference internal" href="#token-bigram"><span class="std std-ref">TokenBigram</span></a> but <code class="docutils literal"><span class="pre">東京都</span></code> isn't expected. You can find only
|
1289
|
-
<code class="docutils literal"><span class="pre">京都</span></code> text by <code class="docutils literal"><span class="pre">京都</span></code> query with <code class="docutils literal"><span class="pre">TokenMecab</span></code>.</p>
|
1290
|
-
<p>If you want to support neologisms, you need to keep updating your
|
1291
|
-
MeCab dictionary. This has a maintenance cost. (<a class="reference internal" href="#token-bigram"><span class="std std-ref">TokenBigram</span></a> doesn't
|
1292
|
-
require dictionary maintenance because <a class="reference internal" href="#token-bigram"><span class="std std-ref">TokenBigram</span></a> doesn't use
|
1293
|
-
dictionary.) <a class="reference external" href="https://github.com/neologd/mecab-ipadic-neologd">mecab-ipadic-NEologd : Neologism dictionary for MeCab</a> may help you.</p>
|
1294
|
-
<p>Here is an example of <code class="docutils literal"><span class="pre">TokenMecab</span></code>. <code class="docutils literal"><span class="pre">東京都</span></code> is tokenized to <code class="docutils literal"><span class="pre">東京</span></code>
|
1295
|
-
and <code class="docutils literal"><span class="pre">都</span></code>. They don't include <code class="docutils literal"><span class="pre">京都</span></code>:</p>
|
1296
|
-
<p>Execution example:</p>
|
1297
|
-
<div class="highlight-none"><div class="highlight"><pre><span></span>tokenize TokenMecab "東京都"
|
1298
|
-
# [
|
1299
|
-
# [
|
1300
|
-
# -22,
|
1301
|
-
# 1337566253.89858,
|
1302
|
-
# 0.000355720520019531,
|
1303
|
-
# "[tokenize] nonexistent tokenizer: <TokenMecab>",
|
1304
|
-
# [
|
1305
|
-
# [
|
1306
|
-
# "create_lexicon_for_tokenize",
|
1307
|
-
# "proc_tokenize.c",
|
1308
|
-
# 139
|
1309
|
-
# ]
|
1310
|
-
# ]
|
1311
|
-
# ]
|
1312
|
-
# ]
|
1313
|
-
</pre></div>
|
1314
|
-
</div>
|
1315
|
-
</div>
|
1316
|
-
<div class="section" id="tokenregexp">
|
1317
|
-
<span id="token-regexp"></span><h3>7.8.3.14. <code class="docutils literal"><span class="pre">TokenRegexp</span></code><a class="headerlink" href="#tokenregexp" title="Permalink to this headline">¶</a></h3>
|
1318
|
-
<div class="versionadded">
|
1319
|
-
<p><span class="versionmodified">New in version 5.0.1.</span></p>
|
1320
|
-
</div>
|
1321
|
-
<div class="admonition caution">
|
1322
|
-
<p class="first admonition-title">Caution</p>
|
1323
|
-
<p class="last">This tokenizer is experimental. Specification may be changed.</p>
|
1324
|
-
</div>
|
1325
|
-
<div class="admonition caution">
|
1326
|
-
<p class="first admonition-title">Caution</p>
|
1327
|
-
<p class="last">This tokenizer can be used only with UTF-8. You can't use this
|
1328
|
-
tokenizer with EUC-JP, Shift_JIS and so on.</p>
|
1329
|
-
</div>
|
1330
|
-
<p><code class="docutils literal"><span class="pre">TokenRegexp</span></code> is a tokenizer for supporting regular expression
|
1331
|
-
search by index.</p>
|
1332
|
-
<p>In general, regular expression search is evaluated as sequential
|
1333
|
-
search. But the following cases can be evaluated as index search:</p>
|
1334
|
-
<blockquote>
|
1335
|
-
<div><ul class="simple">
|
1336
|
-
<li>Literal only case such as <code class="docutils literal"><span class="pre">hello</span></code></li>
|
1337
|
-
<li>The beginning of text and literal case such as <code class="docutils literal"><span class="pre">\A/home/alice</span></code></li>
|
1338
|
-
<li>The end of text and literal case such as <code class="docutils literal"><span class="pre">\.txt\z</span></code></li>
|
1339
|
-
</ul>
|
1340
|
-
</div></blockquote>
|
1341
|
-
<p>In most cases, index search is faster than sequential search.</p>
|
1342
|
-
<p><code class="docutils literal"><span class="pre">TokenRegexp</span></code> is based on bigram tokenize method. <code class="docutils literal"><span class="pre">TokenRegexp</span></code>
|
1343
|
-
adds the beginning of text mark (<code class="docutils literal"><span class="pre">U+FFEF</span></code>) at the beginning of text
|
1344
|
-
and the end of text mark (<code class="docutils literal"><span class="pre">U+FFF0</span></code>) to the end of text when you
|
1345
|
-
index text:</p>
|
1346
|
-
<p>Execution example:</p>
|
1347
|
-
<div class="highlight-none"><div class="highlight"><pre><span></span>tokenize TokenRegexp "/home/alice/test.txt" NormalizerAuto --mode ADD
|
1348
|
-
# [
|
1349
|
-
# [
|
1350
|
-
# 0,
|
1351
|
-
# 1337566253.89858,
|
1352
|
-
# 0.000355720520019531
|
1353
|
-
# ],
|
1354
|
-
# [
|
1355
|
-
# {
|
1356
|
-
# "position": 0,
|
1357
|
-
# "force_prefix": false,
|
1358
|
-
# "value": ""
|
1359
|
-
# },
|
1360
|
-
# {
|
1361
|
-
# "position": 1,
|
1362
|
-
# "force_prefix": false,
|
1363
|
-
# "value": "/h"
|
1364
|
-
# },
|
1365
|
-
# {
|
1366
|
-
# "position": 2,
|
1367
|
-
# "force_prefix": false,
|
1368
|
-
# "value": "ho"
|
1369
|
-
# },
|
1370
|
-
# {
|
1371
|
-
# "position": 3,
|
1372
|
-
# "force_prefix": false,
|
1373
|
-
# "value": "om"
|
1374
|
-
# },
|
1375
|
-
# {
|
1376
|
-
# "position": 4,
|
1377
|
-
# "force_prefix": false,
|
1378
|
-
# "value": "me"
|
1379
|
-
# },
|
1380
|
-
# {
|
1381
|
-
# "position": 5,
|
1382
|
-
# "force_prefix": false,
|
1383
|
-
# "value": "e/"
|
1384
|
-
# },
|
1385
|
-
# {
|
1386
|
-
# "position": 6,
|
1387
|
-
# "force_prefix": false,
|
1388
|
-
# "value": "/a"
|
1389
|
-
# },
|
1390
|
-
# {
|
1391
|
-
# "position": 7,
|
1392
|
-
# "force_prefix": false,
|
1393
|
-
# "value": "al"
|
1394
|
-
# },
|
1395
|
-
# {
|
1396
|
-
# "position": 8,
|
1397
|
-
# "force_prefix": false,
|
1398
|
-
# "value": "li"
|
1399
|
-
# },
|
1400
|
-
# {
|
1401
|
-
# "position": 9,
|
1402
|
-
# "force_prefix": false,
|
1403
|
-
# "value": "ic"
|
1404
|
-
# },
|
1405
|
-
# {
|
1406
|
-
# "position": 10,
|
1407
|
-
# "force_prefix": false,
|
1408
|
-
# "value": "ce"
|
1409
|
-
# },
|
1410
|
-
# {
|
1411
|
-
# "position": 11,
|
1412
|
-
# "force_prefix": false,
|
1413
|
-
# "value": "e/"
|
1414
|
-
# },
|
1415
|
-
# {
|
1416
|
-
# "position": 12,
|
1417
|
-
# "force_prefix": false,
|
1418
|
-
# "value": "/t"
|
1419
|
-
# },
|
1420
|
-
# {
|
1421
|
-
# "position": 13,
|
1422
|
-
# "force_prefix": false,
|
1423
|
-
# "value": "te"
|
1424
|
-
# },
|
1425
|
-
# {
|
1426
|
-
# "position": 14,
|
1427
|
-
# "force_prefix": false,
|
1428
|
-
# "value": "es"
|
1429
|
-
# },
|
1430
|
-
# {
|
1431
|
-
# "position": 15,
|
1432
|
-
# "force_prefix": false,
|
1433
|
-
# "value": "st"
|
1434
|
-
# },
|
1435
|
-
# {
|
1436
|
-
# "position": 16,
|
1437
|
-
# "force_prefix": false,
|
1438
|
-
# "value": "t."
|
1439
|
-
# },
|
1440
|
-
# {
|
1441
|
-
# "position": 17,
|
1442
|
-
# "force_prefix": false,
|
1443
|
-
# "value": ".t"
|
1444
|
-
# },
|
1445
|
-
# {
|
1446
|
-
# "position": 18,
|
1447
|
-
# "force_prefix": false,
|
1448
|
-
# "value": "tx"
|
1449
|
-
# },
|
1450
|
-
# {
|
1451
|
-
# "position": 19,
|
1452
|
-
# "force_prefix": false,
|
1453
|
-
# "value": "xt"
|
1454
|
-
# },
|
1455
|
-
# {
|
1456
|
-
# "position": 20,
|
1457
|
-
# "force_prefix": false,
|
1458
|
-
# "value": "t"
|
1459
|
-
# },
|
1460
|
-
# {
|
1461
|
-
# "position": 21,
|
1462
|
-
# "force_prefix": false,
|
1463
|
-
# "value": ""
|
1464
|
-
# }
|
1465
|
-
# ]
|
1466
|
-
# ]
|
1467
|
-
</pre></div>
|
1468
|
-
</div>
|
1469
|
-
</div>
|
1470
83
|
</div>
|
1471
84
|
</div>
|
1472
85
|
|
@@ -1476,46 +89,20 @@ index text:</p>
|
|
1476
89
|
</div>
|
1477
90
|
<div class="sphinxsidebar" role="navigation" aria-label="main navigation">
|
1478
91
|
<div class="sphinxsidebarwrapper">
|
1479
|
-
<h3><a href="../index.html">Table Of Contents</a></h3>
|
1480
|
-
<ul>
|
1481
|
-
<li><a class="reference internal" href="#">7.8. Tokenizers</a><ul>
|
1482
|
-
<li><a class="reference internal" href="#summary">7.8.1. Summary</a></li>
|
1483
|
-
<li><a class="reference internal" href="#what-is-tokenize">7.8.2. What is "tokenize"?</a></li>
|
1484
|
-
<li><a class="reference internal" href="#built-in-tokenizsers">7.8.3. Built-in tokenizsers</a><ul>
|
1485
|
-
<li><a class="reference internal" href="#tokenbigram">7.8.3.1. <code class="docutils literal"><span class="pre">TokenBigram</span></code></a></li>
|
1486
|
-
<li><a class="reference internal" href="#tokenbigramsplitsymbol">7.8.3.2. <code class="docutils literal"><span class="pre">TokenBigramSplitSymbol</span></code></a></li>
|
1487
|
-
<li><a class="reference internal" href="#tokenbigramsplitsymbolalpha">7.8.3.3. <code class="docutils literal"><span class="pre">TokenBigramSplitSymbolAlpha</span></code></a></li>
|
1488
|
-
<li><a class="reference internal" href="#tokenbigramsplitsymbolalphadigit">7.8.3.4. <code class="docutils literal"><span class="pre">TokenBigramSplitSymbolAlphaDigit</span></code></a></li>
|
1489
|
-
<li><a class="reference internal" href="#tokenbigramignoreblank">7.8.3.5. <code class="docutils literal"><span class="pre">TokenBigramIgnoreBlank</span></code></a></li>
|
1490
|
-
<li><a class="reference internal" href="#tokenbigramignoreblanksplitsymbol">7.8.3.6. <code class="docutils literal"><span class="pre">TokenBigramIgnoreBlankSplitSymbol</span></code></a></li>
|
1491
|
-
<li><a class="reference internal" href="#tokenbigramignoreblanksplitsymbolalpha">7.8.3.7. <code class="docutils literal"><span class="pre">TokenBigramIgnoreBlankSplitSymbolAlpha</span></code></a></li>
|
1492
|
-
<li><a class="reference internal" href="#tokenbigramignoreblanksplitsymbolalphadigit">7.8.3.8. <code class="docutils literal"><span class="pre">TokenBigramIgnoreBlankSplitSymbolAlphaDigit</span></code></a></li>
|
1493
|
-
<li><a class="reference internal" href="#tokenunigram">7.8.3.9. <code class="docutils literal"><span class="pre">TokenUnigram</span></code></a></li>
|
1494
|
-
<li><a class="reference internal" href="#tokentrigram">7.8.3.10. <code class="docutils literal"><span class="pre">TokenTrigram</span></code></a></li>
|
1495
|
-
<li><a class="reference internal" href="#tokendelimit">7.8.3.11. <code class="docutils literal"><span class="pre">TokenDelimit</span></code></a></li>
|
1496
|
-
<li><a class="reference internal" href="#tokendelimitnull">7.8.3.12. <code class="docutils literal"><span class="pre">TokenDelimitNull</span></code></a></li>
|
1497
|
-
<li><a class="reference internal" href="#tokenmecab">7.8.3.13. <code class="docutils literal"><span class="pre">TokenMecab</span></code></a></li>
|
1498
|
-
<li><a class="reference internal" href="#tokenregexp">7.8.3.14. <code class="docutils literal"><span class="pre">TokenRegexp</span></code></a></li>
|
1499
|
-
</ul>
|
1500
|
-
</li>
|
1501
|
-
</ul>
|
1502
|
-
</li>
|
1503
|
-
</ul>
|
1504
|
-
|
1505
92
|
<h4>Previous topic</h4>
|
1506
|
-
<p class="topless"><a href="normalizers.html"
|
1507
|
-
title="previous chapter">7.7.
|
93
|
+
<p class="topless"><a href="normalizers/normalizer_nfkc51.html"
|
94
|
+
title="previous chapter">7.7.2.3. <code class="docutils literal notranslate"><span class="pre">NormalizerNFKC51</span></code></a></p>
|
1508
95
|
<h4>Next topic</h4>
|
1509
|
-
<p class="topless"><a href="
|
1510
|
-
title="next chapter">7.
|
96
|
+
<p class="topless"><a href="tokenizer/summary.html"
|
97
|
+
title="next chapter">7.8.1. Summary</a></p>
|
1511
98
|
<div id="searchbox" style="display: none" role="search">
|
1512
99
|
<h3>Quick search</h3>
|
100
|
+
<div class="searchformwrapper">
|
1513
101
|
<form class="search" action="../search.html" method="get">
|
1514
|
-
<
|
1515
|
-
<
|
1516
|
-
<input type="hidden" name="check_keywords" value="yes" />
|
1517
|
-
<input type="hidden" name="area" value="default" />
|
102
|
+
<input type="text" name="q" />
|
103
|
+
<input type="submit" value="Go" />
|
1518
104
|
</form>
|
105
|
+
</div>
|
1519
106
|
</div>
|
1520
107
|
<script type="text/javascript">$('#searchbox').show(0);</script>
|
1521
108
|
</div>
|
@@ -1529,17 +116,17 @@ index text:</p>
|
|
1529
116
|
<a href="../genindex.html" title="General Index"
|
1530
117
|
>index</a></li>
|
1531
118
|
<li class="right" >
|
1532
|
-
<a href="
|
119
|
+
<a href="tokenizer/summary.html" title="7.8.1. Summary"
|
1533
120
|
>next</a> |</li>
|
1534
121
|
<li class="right" >
|
1535
|
-
<a href="normalizers.html" title="7.7.
|
122
|
+
<a href="normalizers/normalizer_nfkc51.html" title="7.7.2.3. NormalizerNFKC51"
|
1536
123
|
>previous</a> |</li>
|
1537
|
-
<li class="nav-item nav-item-0"><a href="../index.html">Groonga
|
124
|
+
<li class="nav-item nav-item-0"><a href="../index.html">Groonga v9.0.2 documentation</a> »</li>
|
1538
125
|
<li class="nav-item nav-item-1"><a href="../reference.html" >7. Reference manual</a> »</li>
|
1539
126
|
</ul>
|
1540
127
|
</div>
|
1541
128
|
<div class="footer" role="contentinfo">
|
1542
|
-
© Copyright 2009-
|
129
|
+
© Copyright 2009-2019, Brazil, Inc.
|
1543
130
|
</div>
|
1544
131
|
</body>
|
1545
132
|
</html>
|