rroonga 5.0.4-x86-mingw32 → 5.0.5-x86-mingw32
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
 - data/README.md +2 -2
 - data/example/measure-data-column-disk-usage.rb +124 -0
 - data/example/measure-index-column-disk-usage.rb +81 -0
 - data/example/measure-table-disk-usage.rb +100 -0
 - data/ext/groonga/rb-grn-database.c +31 -0
 - data/ext/groonga/rb-grn-double-array-trie.c +1 -8
 - data/ext/groonga/rb-grn-logger.c +45 -0
 - data/ext/groonga/rb-grn-object.c +29 -1
 - data/ext/groonga/rb-grn-patricia-trie.c +1 -8
 - data/ext/groonga/rb-grn-table-cursor.c +8 -3
 - data/ext/groonga/rb-grn-table.c +10 -5
 - data/ext/groonga/rb-grn-thread.c +160 -0
 - data/ext/groonga/rb-grn-windows-event-logger.c +79 -0
 - data/ext/groonga/rb-grn.h +3 -1
 - data/ext/groonga/rb-groonga.c +3 -1
 - data/lib/1.9/groonga.so +0 -0
 - data/lib/2.0/groonga.so +0 -0
 - data/lib/2.1/groonga.so +0 -0
 - data/lib/2.2/groonga.so +0 -0
 - data/lib/groonga/dumper.rb +6 -1
 - data/rroonga-build.rb +4 -4
 - data/test/groonga-test-utils.rb +5 -1
 - data/test/test-database.rb +11 -0
 - data/test/test-logger.rb +6 -0
 - data/test/test-operator.rb +6 -6
 - data/test/test-procedure.rb +15 -0
 - data/test/test-table-dumper.rb +170 -1
 - data/test/test-thread.rb +42 -0
 - data/test/test-windows-event-logger.rb +28 -0
 - data/vendor/local/bin/grndb.exe +0 -0
 - data/vendor/local/bin/groonga-benchmark.exe +0 -0
 - data/vendor/local/bin/groonga.exe +0 -0
 - data/vendor/local/bin/libgcc_s_sjlj-1.dll +0 -0
 - data/vendor/local/bin/libgroonga-0.dll +0 -0
 - data/vendor/local/bin/libmecab-1.dll +0 -0
 - data/vendor/local/bin/libmsgpack-4.dll +0 -0
 - data/vendor/local/bin/libmsgpackc-2.dll +0 -0
 - data/vendor/local/bin/libonig-5.dll +0 -0
 - data/vendor/local/bin/libstdc++-6.dll +0 -0
 - data/vendor/local/bin/libwinpthread-1.dll +0 -0
 - data/vendor/local/bin/lz4.exe +0 -0
 - data/vendor/local/bin/lz4c.exe +0 -0
 - data/vendor/local/bin/lz4cat +0 -0
 - data/vendor/local/bin/mecab-config +2 -2
 - data/vendor/local/bin/mecab.exe +0 -0
 - data/vendor/local/bin/onig-config +1 -1
 - data/vendor/local/bin/zlib1.dll +0 -0
 - data/vendor/local/etc/groonga/httpd/groonga-httpd.conf +2 -2
 - data/vendor/local/etc/groonga/windows_event_log/provider.man +38 -0
 - data/vendor/local/include/groonga/groonga.h +2 -0
 - data/vendor/local/include/groonga/groonga/command.h +2 -0
 - data/vendor/local/include/groonga/groonga/groonga.h +5 -0
 - data/vendor/local/include/groonga/groonga/obj.h +1 -0
 - data/vendor/local/include/groonga/groonga/portability.h +16 -0
 - data/vendor/local/include/groonga/groonga/thread.h +42 -0
 - data/vendor/local/include/groonga/groonga/windows_event_logger.h +33 -0
 - data/vendor/local/lib/groonga/plugins/functions/vector.a +0 -0
 - data/vendor/local/lib/groonga/plugins/functions/vector.dll +0 -0
 - data/vendor/local/lib/groonga/plugins/functions/vector.dll.a +0 -0
 - data/vendor/local/lib/groonga/plugins/functions/vector.la +2 -2
 - data/vendor/local/lib/groonga/plugins/query_expanders/tsv.a +0 -0
 - data/vendor/local/lib/groonga/plugins/query_expanders/tsv.dll +0 -0
 - data/vendor/local/lib/groonga/plugins/query_expanders/tsv.dll.a +0 -0
 - data/vendor/local/lib/groonga/plugins/query_expanders/tsv.la +2 -2
 - data/vendor/local/lib/groonga/plugins/ruby/eval.a +0 -0
 - data/vendor/local/lib/groonga/plugins/ruby/eval.dll +0 -0
 - data/vendor/local/lib/groonga/plugins/ruby/eval.dll.a +0 -0
 - data/vendor/local/lib/groonga/plugins/ruby/eval.la +2 -2
 - data/vendor/local/lib/groonga/plugins/ruby/load.a +0 -0
 - data/vendor/local/lib/groonga/plugins/ruby/load.dll +0 -0
 - data/vendor/local/lib/groonga/plugins/ruby/load.dll.a +0 -0
 - data/vendor/local/lib/groonga/plugins/ruby/load.la +2 -2
 - data/vendor/local/lib/groonga/plugins/sharding.rb +5 -0
 - data/vendor/local/lib/groonga/plugins/sharding/logical_count.rb +43 -6
 - data/vendor/local/lib/groonga/plugins/sharding/logical_enumerator.rb +32 -25
 - data/vendor/local/lib/groonga/plugins/sharding/logical_parameters.rb +44 -0
 - data/vendor/local/lib/groonga/plugins/sharding/logical_range_filter.rb +217 -49
 - data/vendor/local/lib/groonga/plugins/sharding/logical_select.rb +507 -45
 - data/vendor/local/lib/groonga/plugins/sharding/logical_shard_list.rb +28 -0
 - data/vendor/local/lib/groonga/plugins/sharding/logical_table_remove.rb +11 -6
 - data/vendor/local/lib/groonga/plugins/sharding/parameters.rb +10 -0
 - data/vendor/local/lib/groonga/plugins/suggest/suggest.a +0 -0
 - data/vendor/local/lib/groonga/plugins/suggest/suggest.dll +0 -0
 - data/vendor/local/lib/groonga/plugins/suggest/suggest.dll.a +0 -0
 - data/vendor/local/lib/groonga/plugins/suggest/suggest.la +2 -2
 - data/vendor/local/lib/groonga/plugins/table/table.a +0 -0
 - data/vendor/local/lib/groonga/plugins/table/table.dll +0 -0
 - data/vendor/local/lib/groonga/plugins/table/table.dll.a +0 -0
 - data/vendor/local/lib/groonga/plugins/table/table.la +2 -2
 - data/vendor/local/lib/groonga/plugins/token_filters/stop_word.a +0 -0
 - data/vendor/local/lib/groonga/plugins/token_filters/stop_word.dll +0 -0
 - data/vendor/local/lib/groonga/plugins/token_filters/stop_word.dll.a +0 -0
 - data/vendor/local/lib/groonga/plugins/token_filters/stop_word.la +2 -2
 - data/vendor/local/lib/groonga/plugins/tokenizers/mecab.a +0 -0
 - data/vendor/local/lib/groonga/plugins/tokenizers/mecab.dll +0 -0
 - data/vendor/local/lib/groonga/plugins/tokenizers/mecab.dll.a +0 -0
 - data/vendor/local/lib/groonga/plugins/tokenizers/mecab.la +2 -2
 - data/vendor/local/lib/groonga/scripts/ruby/command.rb +31 -1
 - data/vendor/local/lib/groonga/scripts/ruby/context.rb +18 -2
 - data/vendor/local/lib/groonga/scripts/ruby/database.rb +12 -4
 - data/vendor/local/lib/groonga/scripts/ruby/expression_size_estimator.rb +31 -28
 - data/vendor/local/lib/groonga/scripts/ruby/initialize/post.rb +1 -0
 - data/vendor/local/lib/groonga/scripts/ruby/logger/level.rb +4 -2
 - data/vendor/local/lib/groonga/scripts/ruby/query_logger.rb +9 -0
 - data/vendor/local/lib/groonga/scripts/ruby/query_logger/flag.rb +39 -0
 - data/vendor/local/lib/groonga/scripts/ruby/record.rb +12 -0
 - data/vendor/local/lib/groonga/scripts/ruby/table.rb +35 -1
 - data/vendor/local/lib/libgroonga.a +0 -0
 - data/vendor/local/lib/libgroonga.dll.a +0 -0
 - data/vendor/local/lib/libgroonga.la +2 -2
 - data/vendor/local/lib/liblz4.dll +0 -0
 - data/vendor/local/lib/liblz4.dll.1 +0 -0
 - data/vendor/local/lib/liblz4.dll.1.5.0 +0 -0
 - data/vendor/local/lib/libmecab.a +0 -0
 - data/vendor/local/lib/libmecab.dll.a +0 -0
 - data/vendor/local/lib/libmecab.la +2 -2
 - data/vendor/local/lib/libmsgpack.a +0 -0
 - data/vendor/local/lib/libmsgpack.dll.a +0 -0
 - data/vendor/local/lib/libmsgpack.la +2 -2
 - data/vendor/local/lib/libmsgpackc.a +0 -0
 - data/vendor/local/lib/libmsgpackc.dll.a +0 -0
 - data/vendor/local/lib/libmsgpackc.la +2 -2
 - data/vendor/local/lib/libonig.a +0 -0
 - data/vendor/local/lib/libonig.dll.a +0 -0
 - data/vendor/local/lib/libonig.la +2 -2
 - data/vendor/local/lib/libz.a +0 -0
 - data/vendor/local/lib/libz.dll.a +0 -0
 - data/vendor/local/lib/pkgconfig/groonga.pc +3 -3
 - data/vendor/local/lib/pkgconfig/liblz4.pc +5 -5
 - data/vendor/local/lib/pkgconfig/msgpack.pc +1 -1
 - data/vendor/local/lib/pkgconfig/oniguruma.pc +6 -6
 - data/vendor/local/lib/pkgconfig/zlib.pc +3 -3
 - data/vendor/local/libexec/mecab/mecab-cost-train.exe +0 -0
 - data/vendor/local/libexec/mecab/mecab-dict-gen.exe +0 -0
 - data/vendor/local/libexec/mecab/mecab-dict-index.exe +0 -0
 - data/vendor/local/libexec/mecab/mecab-system-eval.exe +0 -0
 - data/vendor/local/libexec/mecab/mecab-test-gen.exe +0 -0
 - data/vendor/local/sbin/groonga-httpd-restart +1 -1
 - data/vendor/local/sbin/groonga-httpd.exe +0 -0
 - data/vendor/local/share/doc/groonga/en/html/.buildinfo +1 -1
 - data/vendor/local/share/doc/groonga/en/html/_sources/contribution/development.txt +3 -2
 - data/vendor/local/share/doc/groonga/en/html/_sources/contribution/development/build.txt +19 -0
 - data/vendor/local/share/doc/groonga/en/html/_sources/contribution/development/build/unix_autotools.txt +101 -0
 - data/vendor/local/share/doc/groonga/en/html/_sources/contribution/development/build/unix_cmake.txt +94 -0
 - data/vendor/local/share/doc/groonga/en/html/_sources/contribution/development/build/windows_cmake.txt +93 -0
 - data/vendor/local/share/doc/groonga/en/html/_sources/contribution/development/release.txt +16 -7
 - data/vendor/local/share/doc/groonga/en/html/_sources/contribution/development/repository.txt +7 -3
 - data/vendor/local/share/doc/groonga/en/html/_sources/contribution/development/test.txt +4 -0
 - data/vendor/local/share/doc/groonga/en/html/_sources/install/centos.txt +3 -3
 - data/vendor/local/share/doc/groonga/en/html/_sources/install/debian.txt +4 -4
 - data/vendor/local/share/doc/groonga/en/html/_sources/install/fedora.txt +3 -3
 - data/vendor/local/share/doc/groonga/en/html/_sources/install/mac_os_x.txt +3 -3
 - data/vendor/local/share/doc/groonga/en/html/_sources/install/others.txt +4 -4
 - data/vendor/local/share/doc/groonga/en/html/_sources/install/solaris.txt +3 -3
 - data/vendor/local/share/doc/groonga/en/html/_sources/install/ubuntu.txt +3 -4
 - data/vendor/local/share/doc/groonga/en/html/_sources/install/windows.txt +9 -9
 - data/vendor/local/share/doc/groonga/en/html/_sources/news.txt +319 -0
 - data/vendor/local/share/doc/groonga/en/html/_sources/reference.txt +1 -0
 - data/vendor/local/share/doc/groonga/en/html/_sources/reference/api/grn_ctx.txt +1 -1
 - data/vendor/local/share/doc/groonga/en/html/_sources/reference/api/grn_db.txt +23 -0
 - data/vendor/local/share/doc/groonga/en/html/_sources/reference/api/grn_thread.txt +122 -0
 - data/vendor/local/share/doc/groonga/en/html/_sources/reference/commands/cache_limit.txt +1 -1
 - data/vendor/local/share/doc/groonga/en/html/_sources/reference/commands/column_copy.txt +381 -0
 - data/vendor/local/share/doc/groonga/en/html/_sources/reference/commands/column_list.txt +1 -1
 - data/vendor/local/share/doc/groonga/en/html/_sources/reference/commands/column_rename.txt +3 -1
 - data/vendor/local/share/doc/groonga/en/html/_sources/reference/commands/database_unmap.txt +85 -0
 - data/vendor/local/share/doc/groonga/en/html/_sources/reference/commands/io_flush.txt +218 -9
 - data/vendor/local/share/doc/groonga/en/html/_sources/reference/commands/lock_clear.txt +1 -3
 - data/vendor/local/share/doc/groonga/en/html/_sources/reference/commands/log_level.txt +1 -1
 - data/vendor/local/share/doc/groonga/en/html/_sources/reference/commands/logical_count.txt +3 -1
 - data/vendor/local/share/doc/groonga/en/html/_sources/reference/commands/logical_parameters.txt +138 -0
 - data/vendor/local/share/doc/groonga/en/html/_sources/reference/commands/logical_range_filter.txt +97 -10
 - data/vendor/local/share/doc/groonga/en/html/_sources/reference/commands/logical_select.txt +745 -23
 - data/vendor/local/share/doc/groonga/en/html/_sources/reference/commands/logical_shard_list.txt +107 -0
 - data/vendor/local/share/doc/groonga/en/html/_sources/reference/commands/logical_table_remove.txt +3 -1
 - data/vendor/local/share/doc/groonga/en/html/_sources/reference/commands/normalize.txt +2 -3
 - data/vendor/local/share/doc/groonga/en/html/_sources/reference/commands/normalizer_list.txt +1 -2
 - data/vendor/local/share/doc/groonga/en/html/_sources/reference/commands/object_exist.txt +90 -0
 - data/vendor/local/share/doc/groonga/en/html/_sources/reference/commands/plugin_register.txt +1 -1
 - data/vendor/local/share/doc/groonga/en/html/_sources/reference/commands/plugin_unregister.txt +1 -1
 - data/vendor/local/share/doc/groonga/en/html/_sources/reference/commands/register.txt +1 -1
 - data/vendor/local/share/doc/groonga/en/html/_sources/reference/commands/request_cancel.txt +1 -3
 - data/vendor/local/share/doc/groonga/en/html/_sources/reference/commands/ruby_eval.txt +1 -1
 - data/vendor/local/share/doc/groonga/en/html/_sources/reference/commands/ruby_load.txt +1 -1
 - data/vendor/local/share/doc/groonga/en/html/_sources/reference/commands/select.txt +240 -56
 - data/vendor/local/share/doc/groonga/en/html/_sources/reference/commands/table_create.txt +33 -7
 - data/vendor/local/share/doc/groonga/en/html/_sources/reference/commands/table_rename.txt +90 -0
 - data/vendor/local/share/doc/groonga/en/html/_sources/reference/commands/table_tokenize.txt +2 -1
 - data/vendor/local/share/doc/groonga/en/html/_sources/reference/commands/thread_limit.txt +110 -0
 - data/vendor/local/share/doc/groonga/en/html/_sources/reference/commands/tokenize.txt +2 -1
 - data/vendor/local/share/doc/groonga/en/html/_sources/reference/commands/tokenizer_list.txt +1 -3
 - data/vendor/local/share/doc/groonga/en/html/_sources/reference/commands/truncate.txt +1 -3
 - data/vendor/local/share/doc/groonga/en/html/_sources/reference/executables/groonga-httpd.txt +3 -4
 - data/vendor/local/share/doc/groonga/en/html/_sources/reference/functions/highlight_full.txt +0 -1
 - data/vendor/local/share/doc/groonga/en/html/_sources/reference/functions/highlight_html.txt +0 -1
 - data/vendor/local/share/doc/groonga/en/html/_sources/reference/functions/query.txt +2 -2
 - data/vendor/local/share/doc/groonga/en/html/_sources/reference/functions/snippet_html.txt +1 -1
 - data/vendor/local/share/doc/groonga/en/html/_sources/reference/grn_expr/query_syntax.txt +1 -1
 - data/vendor/local/share/doc/groonga/en/html/_sources/reference/grn_expr/script_syntax.txt +34 -14
 - data/vendor/local/share/doc/groonga/en/html/_sources/reference/indexing.txt +2 -2
 - data/vendor/local/share/doc/groonga/en/html/_sources/reference/query_expanders/tsv.txt +1 -1
 - data/vendor/local/share/doc/groonga/en/html/_sources/reference/regular_expression.txt +3 -0
 - data/vendor/local/share/doc/groonga/en/html/_sources/reference/scoring_note.txt +2 -0
 - data/vendor/local/share/doc/groonga/en/html/_sources/reference/sharding.txt +108 -0
 - data/vendor/local/share/doc/groonga/en/html/_sources/reference/tokenizers.txt +0 -21
 - data/vendor/local/share/doc/groonga/en/html/_sources/reference/tuning.txt +1 -1
 - data/vendor/local/share/doc/groonga/en/html/_sources/spec/search.txt +1 -1
 - data/vendor/local/share/doc/groonga/en/html/_sources/troubleshooting/different_results_with_the_same_keyword.txt +4 -4
 - data/vendor/local/share/doc/groonga/en/html/_sources/tutorial/introduction.txt +24 -18
 - data/vendor/local/share/doc/groonga/en/html/_sources/tutorial/match_columns.txt +19 -19
 - data/vendor/local/share/doc/groonga/en/html/_sources/tutorial/micro_blog.txt +9 -9
 - data/vendor/local/share/doc/groonga/en/html/_sources/tutorial/query_expansion.txt +1 -1
 - data/vendor/local/share/doc/groonga/en/html/_static/basic.css +68 -6
 - data/vendor/local/share/doc/groonga/en/html/_static/doctools.js +27 -2
 - data/vendor/local/share/doc/groonga/en/html/_static/down-pressed.png +0 -0
 - data/vendor/local/share/doc/groonga/en/html/_static/down.png +0 -0
 - data/vendor/local/share/doc/groonga/en/html/_static/file.png +0 -0
 - data/vendor/local/share/doc/groonga/en/html/_static/jquery-1.11.1.js +10308 -0
 - data/vendor/local/share/doc/groonga/en/html/_static/jquery.js +4 -9404
 - data/vendor/local/share/doc/groonga/en/html/_static/minus.png +0 -0
 - data/vendor/local/share/doc/groonga/en/html/_static/plus.png +0 -0
 - data/vendor/local/share/doc/groonga/en/html/_static/searchtools.js +2 -2
 - data/vendor/local/share/doc/groonga/en/html/_static/underscore-1.3.1.js +999 -0
 - data/vendor/local/share/doc/groonga/en/html/_static/underscore.js +31 -1415
 - data/vendor/local/share/doc/groonga/en/html/_static/up-pressed.png +0 -0
 - data/vendor/local/share/doc/groonga/en/html/_static/up.png +0 -0
 - data/vendor/local/share/doc/groonga/en/html/_static/websupport.js +15 -15
 - data/vendor/local/share/doc/groonga/en/html/characteristic.html +19 -17
 - data/vendor/local/share/doc/groonga/en/html/client.html +19 -17
 - data/vendor/local/share/doc/groonga/en/html/community.html +19 -17
 - data/vendor/local/share/doc/groonga/en/html/contribution.html +78 -70
 - data/vendor/local/share/doc/groonga/en/html/contribution/development.html +30 -27
 - data/vendor/local/share/doc/groonga/en/html/contribution/development/build.html +146 -0
 - data/vendor/local/share/doc/groonga/en/html/contribution/development/build/unix_autotools.html +237 -0
 - data/vendor/local/share/doc/groonga/en/html/contribution/development/build/unix_cmake.html +227 -0
 - data/vendor/local/share/doc/groonga/en/html/contribution/development/build/windows_cmake.html +231 -0
 - data/vendor/local/share/doc/groonga/en/html/contribution/development/com.html +37 -35
 - data/vendor/local/share/doc/groonga/en/html/contribution/development/cooperation.html +54 -52
 - data/vendor/local/share/doc/groonga/en/html/contribution/development/query.html +80 -78
 - data/vendor/local/share/doc/groonga/en/html/contribution/development/release.html +135 -122
 - data/vendor/local/share/doc/groonga/en/html/contribution/development/repository.html +38 -34
 - data/vendor/local/share/doc/groonga/en/html/contribution/development/test.html +58 -54
 - data/vendor/local/share/doc/groonga/en/html/contribution/documentation.html +21 -19
 - data/vendor/local/share/doc/groonga/en/html/contribution/documentation/c-api.html +27 -25
 - data/vendor/local/share/doc/groonga/en/html/contribution/documentation/i18n.html +23 -21
 - data/vendor/local/share/doc/groonga/en/html/contribution/documentation/introduction.html +30 -28
 - data/vendor/local/share/doc/groonga/en/html/contribution/report.html +21 -19
 - data/vendor/local/share/doc/groonga/en/html/development.html +19 -17
 - data/vendor/local/share/doc/groonga/en/html/development/travis-ci.html +30 -28
 - data/vendor/local/share/doc/groonga/en/html/genindex.html +48 -20
 - data/vendor/local/share/doc/groonga/en/html/index.html +123 -105
 - data/vendor/local/share/doc/groonga/en/html/install.html +33 -31
 - data/vendor/local/share/doc/groonga/en/html/install/centos.html +32 -30
 - data/vendor/local/share/doc/groonga/en/html/install/debian.html +31 -29
 - data/vendor/local/share/doc/groonga/en/html/install/fedora.html +29 -27
 - data/vendor/local/share/doc/groonga/en/html/install/mac_os_x.html +26 -24
 - data/vendor/local/share/doc/groonga/en/html/install/others.html +92 -90
 - data/vendor/local/share/doc/groonga/en/html/install/solaris.html +26 -24
 - data/vendor/local/share/doc/groonga/en/html/install/ubuntu.html +29 -28
 - data/vendor/local/share/doc/groonga/en/html/install/windows.html +34 -32
 - data/vendor/local/share/doc/groonga/en/html/limitations.html +19 -17
 - data/vendor/local/share/doc/groonga/en/html/news.html +509 -142
 - data/vendor/local/share/doc/groonga/en/html/news/0.x.html +19 -17
 - data/vendor/local/share/doc/groonga/en/html/news/1.0.x.html +20 -18
 - data/vendor/local/share/doc/groonga/en/html/news/1.1.x.html +19 -17
 - data/vendor/local/share/doc/groonga/en/html/news/1.2.x.html +34 -32
 - data/vendor/local/share/doc/groonga/en/html/news/1.3.x.html +29 -27
 - data/vendor/local/share/doc/groonga/en/html/news/2.x.html +110 -108
 - data/vendor/local/share/doc/groonga/en/html/news/3.x.html +73 -71
 - data/vendor/local/share/doc/groonga/en/html/news/4.x.html +111 -109
 - data/vendor/local/share/doc/groonga/en/html/news/senna.html +19 -17
 - data/vendor/local/share/doc/groonga/en/html/objects.inv +0 -0
 - data/vendor/local/share/doc/groonga/en/html/reference.html +111 -94
 - data/vendor/local/share/doc/groonga/en/html/reference/api.html +55 -52
 - data/vendor/local/share/doc/groonga/en/html/reference/api/global_configurations.html +51 -49
 - data/vendor/local/share/doc/groonga/en/html/reference/api/grn_cache.html +63 -61
 - data/vendor/local/share/doc/groonga/en/html/reference/api/grn_column.html +84 -82
 - data/vendor/local/share/doc/groonga/en/html/reference/api/grn_command_version.html +46 -44
 - data/vendor/local/share/doc/groonga/en/html/reference/api/grn_content_type.html +41 -39
 - data/vendor/local/share/doc/groonga/en/html/reference/api/grn_ctx.html +89 -87
 - data/vendor/local/share/doc/groonga/en/html/reference/api/grn_db.html +88 -50
 - data/vendor/local/share/doc/groonga/en/html/reference/api/grn_encoding.html +48 -46
 - data/vendor/local/share/doc/groonga/en/html/reference/api/grn_expr.html +83 -81
 - data/vendor/local/share/doc/groonga/en/html/reference/api/grn_geo.html +46 -44
 - data/vendor/local/share/doc/groonga/en/html/reference/api/grn_hook.html +48 -46
 - data/vendor/local/share/doc/groonga/en/html/reference/api/grn_ii.html +46 -44
 - data/vendor/local/share/doc/groonga/en/html/reference/api/grn_index_cursor.html +45 -43
 - data/vendor/local/share/doc/groonga/en/html/reference/api/grn_info.html +45 -43
 - data/vendor/local/share/doc/groonga/en/html/reference/api/grn_match_escalation.html +44 -42
 - data/vendor/local/share/doc/groonga/en/html/reference/api/grn_obj.html +93 -91
 - data/vendor/local/share/doc/groonga/en/html/reference/api/grn_proc.html +48 -46
 - data/vendor/local/share/doc/groonga/en/html/reference/api/grn_search.html +43 -41
 - data/vendor/local/share/doc/groonga/en/html/reference/api/grn_table.html +79 -77
 - data/vendor/local/share/doc/groonga/en/html/reference/api/grn_table_cursor.html +69 -67
 - data/vendor/local/share/doc/groonga/en/html/reference/api/grn_thread.html +296 -0
 - data/vendor/local/share/doc/groonga/en/html/reference/api/grn_type.html +45 -43
 - data/vendor/local/share/doc/groonga/en/html/reference/api/grn_user_data.html +38 -36
 - data/vendor/local/share/doc/groonga/en/html/reference/api/overview.html +51 -49
 - data/vendor/local/share/doc/groonga/en/html/reference/api/plugin.html +60 -58
 - data/vendor/local/share/doc/groonga/en/html/reference/cast.html +19 -17
 - data/vendor/local/share/doc/groonga/en/html/reference/column.html +21 -19
 - data/vendor/local/share/doc/groonga/en/html/reference/columns/index.html +23 -21
 - data/vendor/local/share/doc/groonga/en/html/reference/columns/pseudo.html +28 -26
 - data/vendor/local/share/doc/groonga/en/html/reference/columns/scalar.html +23 -21
 - data/vendor/local/share/doc/groonga/en/html/reference/columns/vector.html +72 -70
 - data/vendor/local/share/doc/groonga/en/html/reference/command.html +70 -61
 - data/vendor/local/share/doc/groonga/en/html/reference/command/command_version.html +23 -21
 - data/vendor/local/share/doc/groonga/en/html/reference/command/output_format.html +59 -57
 - data/vendor/local/share/doc/groonga/en/html/reference/command/request_id.html +28 -26
 - data/vendor/local/share/doc/groonga/en/html/reference/command/return_code.html +100 -98
 - data/vendor/local/share/doc/groonga/en/html/reference/commands/cache_limit.html +44 -42
 - data/vendor/local/share/doc/groonga/en/html/reference/commands/check.html +49 -47
 - data/vendor/local/share/doc/groonga/en/html/reference/commands/clearlock.html +33 -31
 - data/vendor/local/share/doc/groonga/en/html/reference/commands/column_copy.html +796 -0
 - data/vendor/local/share/doc/groonga/en/html/reference/commands/column_create.html +62 -60
 - data/vendor/local/share/doc/groonga/en/html/reference/commands/column_list.html +96 -94
 - data/vendor/local/share/doc/groonga/en/html/reference/commands/column_remove.html +46 -44
 - data/vendor/local/share/doc/groonga/en/html/reference/commands/column_rename.html +67 -64
 - data/vendor/local/share/doc/groonga/en/html/reference/commands/database_unmap.html +236 -0
 - data/vendor/local/share/doc/groonga/en/html/reference/commands/define_selector.html +62 -60
 - data/vendor/local/share/doc/groonga/en/html/reference/commands/defrag.html +46 -44
 - data/vendor/local/share/doc/groonga/en/html/reference/commands/delete.html +59 -57
 - data/vendor/local/share/doc/groonga/en/html/reference/commands/dump.html +63 -61
 - data/vendor/local/share/doc/groonga/en/html/reference/commands/io_flush.html +281 -54
 - data/vendor/local/share/doc/groonga/en/html/reference/commands/load.html +66 -64
 - data/vendor/local/share/doc/groonga/en/html/reference/commands/lock_clear.html +53 -52
 - data/vendor/local/share/doc/groonga/en/html/reference/commands/log_level.html +48 -46
 - data/vendor/local/share/doc/groonga/en/html/reference/commands/log_put.html +48 -46
 - data/vendor/local/share/doc/groonga/en/html/reference/commands/log_reopen.html +48 -46
 - data/vendor/local/share/doc/groonga/en/html/reference/commands/logical_count.html +78 -75
 - data/vendor/local/share/doc/groonga/en/html/reference/commands/logical_parameters.html +283 -0
 - data/vendor/local/share/doc/groonga/en/html/reference/commands/logical_range_filter.html +160 -85
 - data/vendor/local/share/doc/groonga/en/html/reference/commands/logical_select.html +2071 -83
 - data/vendor/local/share/doc/groonga/en/html/reference/commands/logical_shard_list.html +287 -0
 - data/vendor/local/share/doc/groonga/en/html/reference/commands/logical_table_remove.html +71 -68
 - data/vendor/local/share/doc/groonga/en/html/reference/commands/normalize.html +86 -84
 - data/vendor/local/share/doc/groonga/en/html/reference/commands/normalizer_list.html +52 -50
 - data/vendor/local/share/doc/groonga/en/html/reference/commands/object_exist.html +227 -0
 - data/vendor/local/share/doc/groonga/en/html/reference/commands/plugin_register.html +57 -55
 - data/vendor/local/share/doc/groonga/en/html/reference/commands/plugin_unregister.html +53 -51
 - data/vendor/local/share/doc/groonga/en/html/reference/commands/quit.html +43 -41
 - data/vendor/local/share/doc/groonga/en/html/reference/commands/range_filter.html +43 -41
 - data/vendor/local/share/doc/groonga/en/html/reference/commands/register.html +56 -54
 - data/vendor/local/share/doc/groonga/en/html/reference/commands/request_cancel.html +75 -74
 - data/vendor/local/share/doc/groonga/en/html/reference/commands/ruby_eval.html +59 -57
 - data/vendor/local/share/doc/groonga/en/html/reference/commands/ruby_load.html +59 -57
 - data/vendor/local/share/doc/groonga/en/html/reference/commands/select.html +898 -647
 - data/vendor/local/share/doc/groonga/en/html/reference/commands/shutdown.html +43 -41
 - data/vendor/local/share/doc/groonga/en/html/reference/commands/status.html +50 -48
 - data/vendor/local/share/doc/groonga/en/html/reference/commands/suggest.html +87 -85
 - data/vendor/local/share/doc/groonga/en/html/reference/commands/table_create.html +175 -152
 - data/vendor/local/share/doc/groonga/en/html/reference/commands/table_list.html +55 -53
 - data/vendor/local/share/doc/groonga/en/html/reference/commands/table_remove.html +46 -44
 - data/vendor/local/share/doc/groonga/en/html/reference/commands/table_rename.html +327 -0
 - data/vendor/local/share/doc/groonga/en/html/reference/commands/table_tokenize.html +77 -75
 - data/vendor/local/share/doc/groonga/en/html/reference/commands/thread_limit.html +241 -0
 - data/vendor/local/share/doc/groonga/en/html/reference/commands/tokenize.html +108 -106
 - data/vendor/local/share/doc/groonga/en/html/reference/commands/tokenizer_list.html +56 -51
 - data/vendor/local/share/doc/groonga/en/html/reference/commands/truncate.html +56 -55
 - data/vendor/local/share/doc/groonga/en/html/reference/executables.html +24 -22
 - data/vendor/local/share/doc/groonga/en/html/reference/executables/grndb.html +41 -39
 - data/vendor/local/share/doc/groonga/en/html/reference/executables/grnslap.html +28 -26
 - data/vendor/local/share/doc/groonga/en/html/reference/executables/groonga-benchmark.html +33 -31
 - data/vendor/local/share/doc/groonga/en/html/reference/executables/groonga-httpd.html +83 -81
 - data/vendor/local/share/doc/groonga/en/html/reference/executables/groonga-server-http.html +29 -27
 - data/vendor/local/share/doc/groonga/en/html/reference/executables/groonga-suggest-create-dataset.html +23 -21
 - data/vendor/local/share/doc/groonga/en/html/reference/executables/groonga-suggest-httpd.html +48 -46
 - data/vendor/local/share/doc/groonga/en/html/reference/executables/groonga-suggest-learner.html +40 -38
 - data/vendor/local/share/doc/groonga/en/html/reference/executables/groonga.html +84 -82
 - data/vendor/local/share/doc/groonga/en/html/reference/function.html +22 -20
 - data/vendor/local/share/doc/groonga/en/html/reference/functions/between.html +47 -45
 - data/vendor/local/share/doc/groonga/en/html/reference/functions/edit_distance.html +25 -23
 - data/vendor/local/share/doc/groonga/en/html/reference/functions/geo_distance.html +85 -83
 - data/vendor/local/share/doc/groonga/en/html/reference/functions/geo_in_circle.html +36 -34
 - data/vendor/local/share/doc/groonga/en/html/reference/functions/geo_in_rectangle.html +26 -24
 - data/vendor/local/share/doc/groonga/en/html/reference/functions/highlight_full.html +66 -64
 - data/vendor/local/share/doc/groonga/en/html/reference/functions/highlight_html.html +54 -52
 - data/vendor/local/share/doc/groonga/en/html/reference/functions/html_untag.html +35 -33
 - data/vendor/local/share/doc/groonga/en/html/reference/functions/in_values.html +34 -32
 - data/vendor/local/share/doc/groonga/en/html/reference/functions/now.html +23 -21
 - data/vendor/local/share/doc/groonga/en/html/reference/functions/query.html +55 -53
 - data/vendor/local/share/doc/groonga/en/html/reference/functions/rand.html +24 -22
 - data/vendor/local/share/doc/groonga/en/html/reference/functions/snippet_html.html +49 -47
 - data/vendor/local/share/doc/groonga/en/html/reference/functions/sub_filter.html +45 -43
 - data/vendor/local/share/doc/groonga/en/html/reference/functions/vector_size.html +33 -31
 - data/vendor/local/share/doc/groonga/en/html/reference/grn_expr.html +29 -27
 - data/vendor/local/share/doc/groonga/en/html/reference/grn_expr/query_syntax.html +244 -242
 - data/vendor/local/share/doc/groonga/en/html/reference/grn_expr/script_syntax.html +327 -325
 - data/vendor/local/share/doc/groonga/en/html/reference/indexing.html +35 -29
 - data/vendor/local/share/doc/groonga/en/html/reference/log.html +43 -41
 - data/vendor/local/share/doc/groonga/en/html/reference/normalizers.html +49 -47
 - data/vendor/local/share/doc/groonga/en/html/reference/operations.html +21 -19
 - data/vendor/local/share/doc/groonga/en/html/reference/operations/geolocation_search.html +23 -21
 - data/vendor/local/share/doc/groonga/en/html/reference/output.html +36 -34
 - data/vendor/local/share/doc/groonga/en/html/reference/query_expanders.html +21 -19
 - data/vendor/local/share/doc/groonga/en/html/reference/query_expanders/tsv.html +62 -60
 - data/vendor/local/share/doc/groonga/en/html/reference/regular_expression.html +107 -103
 - data/vendor/local/share/doc/groonga/en/html/reference/scorer.html +50 -40
 - data/vendor/local/share/doc/groonga/en/html/reference/scorers/scorer_tf_at_most.html +27 -25
 - data/vendor/local/share/doc/groonga/en/html/reference/scorers/scorer_tf_idf.html +41 -31
 - data/vendor/local/share/doc/groonga/en/html/reference/scoring_note.html +19 -17
 - data/vendor/local/share/doc/groonga/en/html/reference/sharding.html +241 -0
 - data/vendor/local/share/doc/groonga/en/html/reference/suggest.html +21 -19
 - data/vendor/local/share/doc/groonga/en/html/reference/suggest/completion.html +30 -28
 - data/vendor/local/share/doc/groonga/en/html/reference/suggest/correction.html +23 -21
 - data/vendor/local/share/doc/groonga/en/html/reference/suggest/introduction.html +23 -21
 - data/vendor/local/share/doc/groonga/en/html/reference/suggest/suggestion.html +23 -21
 - data/vendor/local/share/doc/groonga/en/html/reference/tables.html +56 -54
 - data/vendor/local/share/doc/groonga/en/html/reference/token_filters.html +41 -39
 - data/vendor/local/share/doc/groonga/en/html/reference/tokenizers.html +341 -289
 - data/vendor/local/share/doc/groonga/en/html/reference/tuning.html +68 -66
 - data/vendor/local/share/doc/groonga/en/html/reference/types.html +43 -41
 - data/vendor/local/share/doc/groonga/en/html/search.html +11 -11
 - data/vendor/local/share/doc/groonga/en/html/searchindex.js +1 -1
 - data/vendor/local/share/doc/groonga/en/html/server.html +19 -17
 - data/vendor/local/share/doc/groonga/en/html/server/gqtp.html +22 -20
 - data/vendor/local/share/doc/groonga/en/html/server/http.html +21 -19
 - data/vendor/local/share/doc/groonga/en/html/server/http/comparison.html +37 -35
 - data/vendor/local/share/doc/groonga/en/html/server/http/groonga-httpd.html +23 -21
 - data/vendor/local/share/doc/groonga/en/html/server/http/groonga.html +23 -21
 - data/vendor/local/share/doc/groonga/en/html/server/memcached.html +21 -19
 - data/vendor/local/share/doc/groonga/en/html/server/package.html +39 -37
 - data/vendor/local/share/doc/groonga/en/html/spec.html +23 -21
 - data/vendor/local/share/doc/groonga/en/html/spec/gqtp.html +129 -127
 - data/vendor/local/share/doc/groonga/en/html/spec/search.html +22 -20
 - data/vendor/local/share/doc/groonga/en/html/troubleshooting.html +19 -17
 - data/vendor/local/share/doc/groonga/en/html/troubleshooting/different_results_with_the_same_keyword.html +25 -23
 - data/vendor/local/share/doc/groonga/en/html/troubleshooting/mmap_cannot_allocate_memory.html +22 -20
 - data/vendor/local/share/doc/groonga/en/html/tutorial.html +20 -18
 - data/vendor/local/share/doc/groonga/en/html/tutorial/data.html +29 -23
 - data/vendor/local/share/doc/groonga/en/html/tutorial/drilldown.html +35 -33
 - data/vendor/local/share/doc/groonga/en/html/tutorial/index.html +25 -23
 - data/vendor/local/share/doc/groonga/en/html/tutorial/introduction.html +46 -39
 - data/vendor/local/share/doc/groonga/en/html/tutorial/lexicon.html +21 -19
 - data/vendor/local/share/doc/groonga/en/html/tutorial/match_columns.html +65 -63
 - data/vendor/local/share/doc/groonga/en/html/tutorial/micro_blog.html +55 -53
 - data/vendor/local/share/doc/groonga/en/html/tutorial/network.html +25 -23
 - data/vendor/local/share/doc/groonga/en/html/tutorial/patricia_trie.html +22 -20
 - data/vendor/local/share/doc/groonga/en/html/tutorial/query_expansion.html +25 -23
 - data/vendor/local/share/doc/groonga/en/html/tutorial/search.html +37 -35
 - data/vendor/local/share/doc/groonga/ja/html/.buildinfo +1 -1
 - data/vendor/local/share/doc/groonga/ja/html/_sources/contribution/development.txt +3 -2
 - data/vendor/local/share/doc/groonga/ja/html/_sources/contribution/development/build.txt +19 -0
 - data/vendor/local/share/doc/groonga/ja/html/_sources/contribution/development/build/unix_autotools.txt +101 -0
 - data/vendor/local/share/doc/groonga/ja/html/_sources/contribution/development/build/unix_cmake.txt +94 -0
 - data/vendor/local/share/doc/groonga/ja/html/_sources/contribution/development/build/windows_cmake.txt +93 -0
 - data/vendor/local/share/doc/groonga/ja/html/_sources/contribution/development/release.txt +16 -7
 - data/vendor/local/share/doc/groonga/ja/html/_sources/contribution/development/repository.txt +7 -3
 - data/vendor/local/share/doc/groonga/ja/html/_sources/contribution/development/test.txt +4 -0
 - data/vendor/local/share/doc/groonga/ja/html/_sources/install/centos.txt +3 -3
 - data/vendor/local/share/doc/groonga/ja/html/_sources/install/debian.txt +4 -4
 - data/vendor/local/share/doc/groonga/ja/html/_sources/install/fedora.txt +3 -3
 - data/vendor/local/share/doc/groonga/ja/html/_sources/install/mac_os_x.txt +3 -3
 - data/vendor/local/share/doc/groonga/ja/html/_sources/install/others.txt +4 -4
 - data/vendor/local/share/doc/groonga/ja/html/_sources/install/solaris.txt +3 -3
 - data/vendor/local/share/doc/groonga/ja/html/_sources/install/ubuntu.txt +3 -4
 - data/vendor/local/share/doc/groonga/ja/html/_sources/install/windows.txt +9 -9
 - data/vendor/local/share/doc/groonga/ja/html/_sources/news.txt +319 -0
 - data/vendor/local/share/doc/groonga/ja/html/_sources/reference.txt +1 -0
 - data/vendor/local/share/doc/groonga/ja/html/_sources/reference/api/grn_ctx.txt +1 -1
 - data/vendor/local/share/doc/groonga/ja/html/_sources/reference/api/grn_db.txt +23 -0
 - data/vendor/local/share/doc/groonga/ja/html/_sources/reference/api/grn_thread.txt +122 -0
 - data/vendor/local/share/doc/groonga/ja/html/_sources/reference/commands/cache_limit.txt +1 -1
 - data/vendor/local/share/doc/groonga/ja/html/_sources/reference/commands/column_copy.txt +381 -0
 - data/vendor/local/share/doc/groonga/ja/html/_sources/reference/commands/column_list.txt +1 -1
 - data/vendor/local/share/doc/groonga/ja/html/_sources/reference/commands/column_rename.txt +3 -1
 - data/vendor/local/share/doc/groonga/ja/html/_sources/reference/commands/database_unmap.txt +85 -0
 - data/vendor/local/share/doc/groonga/ja/html/_sources/reference/commands/io_flush.txt +218 -9
 - data/vendor/local/share/doc/groonga/ja/html/_sources/reference/commands/lock_clear.txt +1 -3
 - data/vendor/local/share/doc/groonga/ja/html/_sources/reference/commands/log_level.txt +1 -1
 - data/vendor/local/share/doc/groonga/ja/html/_sources/reference/commands/logical_count.txt +3 -1
 - data/vendor/local/share/doc/groonga/ja/html/_sources/reference/commands/logical_parameters.txt +138 -0
 - data/vendor/local/share/doc/groonga/ja/html/_sources/reference/commands/logical_range_filter.txt +97 -10
 - data/vendor/local/share/doc/groonga/ja/html/_sources/reference/commands/logical_select.txt +745 -23
 - data/vendor/local/share/doc/groonga/ja/html/_sources/reference/commands/logical_shard_list.txt +107 -0
 - data/vendor/local/share/doc/groonga/ja/html/_sources/reference/commands/logical_table_remove.txt +3 -1
 - data/vendor/local/share/doc/groonga/ja/html/_sources/reference/commands/normalize.txt +2 -3
 - data/vendor/local/share/doc/groonga/ja/html/_sources/reference/commands/normalizer_list.txt +1 -2
 - data/vendor/local/share/doc/groonga/ja/html/_sources/reference/commands/object_exist.txt +90 -0
 - data/vendor/local/share/doc/groonga/ja/html/_sources/reference/commands/plugin_register.txt +1 -1
 - data/vendor/local/share/doc/groonga/ja/html/_sources/reference/commands/plugin_unregister.txt +1 -1
 - data/vendor/local/share/doc/groonga/ja/html/_sources/reference/commands/register.txt +1 -1
 - data/vendor/local/share/doc/groonga/ja/html/_sources/reference/commands/request_cancel.txt +1 -3
 - data/vendor/local/share/doc/groonga/ja/html/_sources/reference/commands/ruby_eval.txt +1 -1
 - data/vendor/local/share/doc/groonga/ja/html/_sources/reference/commands/ruby_load.txt +1 -1
 - data/vendor/local/share/doc/groonga/ja/html/_sources/reference/commands/select.txt +240 -56
 - data/vendor/local/share/doc/groonga/ja/html/_sources/reference/commands/table_create.txt +33 -7
 - data/vendor/local/share/doc/groonga/ja/html/_sources/reference/commands/table_rename.txt +90 -0
 - data/vendor/local/share/doc/groonga/ja/html/_sources/reference/commands/table_tokenize.txt +2 -1
 - data/vendor/local/share/doc/groonga/ja/html/_sources/reference/commands/thread_limit.txt +110 -0
 - data/vendor/local/share/doc/groonga/ja/html/_sources/reference/commands/tokenize.txt +2 -1
 - data/vendor/local/share/doc/groonga/ja/html/_sources/reference/commands/tokenizer_list.txt +1 -3
 - data/vendor/local/share/doc/groonga/ja/html/_sources/reference/commands/truncate.txt +1 -3
 - data/vendor/local/share/doc/groonga/ja/html/_sources/reference/executables/groonga-httpd.txt +3 -4
 - data/vendor/local/share/doc/groonga/ja/html/_sources/reference/functions/highlight_full.txt +0 -1
 - data/vendor/local/share/doc/groonga/ja/html/_sources/reference/functions/highlight_html.txt +0 -1
 - data/vendor/local/share/doc/groonga/ja/html/_sources/reference/functions/query.txt +2 -2
 - data/vendor/local/share/doc/groonga/ja/html/_sources/reference/functions/snippet_html.txt +1 -1
 - data/vendor/local/share/doc/groonga/ja/html/_sources/reference/grn_expr/query_syntax.txt +1 -1
 - data/vendor/local/share/doc/groonga/ja/html/_sources/reference/grn_expr/script_syntax.txt +34 -14
 - data/vendor/local/share/doc/groonga/ja/html/_sources/reference/indexing.txt +2 -2
 - data/vendor/local/share/doc/groonga/ja/html/_sources/reference/query_expanders/tsv.txt +1 -1
 - data/vendor/local/share/doc/groonga/ja/html/_sources/reference/regular_expression.txt +3 -0
 - data/vendor/local/share/doc/groonga/ja/html/_sources/reference/scoring_note.txt +2 -0
 - data/vendor/local/share/doc/groonga/ja/html/_sources/reference/sharding.txt +108 -0
 - data/vendor/local/share/doc/groonga/ja/html/_sources/reference/tokenizers.txt +0 -21
 - data/vendor/local/share/doc/groonga/ja/html/_sources/reference/tuning.txt +1 -1
 - data/vendor/local/share/doc/groonga/ja/html/_sources/spec/search.txt +1 -1
 - data/vendor/local/share/doc/groonga/ja/html/_sources/troubleshooting/different_results_with_the_same_keyword.txt +4 -4
 - data/vendor/local/share/doc/groonga/ja/html/_sources/tutorial/introduction.txt +24 -18
 - data/vendor/local/share/doc/groonga/ja/html/_sources/tutorial/match_columns.txt +19 -19
 - data/vendor/local/share/doc/groonga/ja/html/_sources/tutorial/micro_blog.txt +9 -9
 - data/vendor/local/share/doc/groonga/ja/html/_sources/tutorial/query_expansion.txt +1 -1
 - data/vendor/local/share/doc/groonga/ja/html/_static/basic.css +68 -6
 - data/vendor/local/share/doc/groonga/ja/html/_static/doctools.js +27 -2
 - data/vendor/local/share/doc/groonga/ja/html/_static/down-pressed.png +0 -0
 - data/vendor/local/share/doc/groonga/ja/html/_static/down.png +0 -0
 - data/vendor/local/share/doc/groonga/ja/html/_static/file.png +0 -0
 - data/vendor/local/share/doc/groonga/ja/html/_static/jquery-1.11.1.js +10308 -0
 - data/vendor/local/share/doc/groonga/ja/html/_static/jquery.js +4 -9404
 - data/vendor/local/share/doc/groonga/ja/html/_static/minus.png +0 -0
 - data/vendor/local/share/doc/groonga/ja/html/_static/plus.png +0 -0
 - data/vendor/local/share/doc/groonga/ja/html/_static/searchtools.js +2 -2
 - data/vendor/local/share/doc/groonga/ja/html/_static/underscore-1.3.1.js +999 -0
 - data/vendor/local/share/doc/groonga/ja/html/_static/underscore.js +31 -1415
 - data/vendor/local/share/doc/groonga/ja/html/_static/up-pressed.png +0 -0
 - data/vendor/local/share/doc/groonga/ja/html/_static/up.png +0 -0
 - data/vendor/local/share/doc/groonga/ja/html/_static/websupport.js +15 -15
 - data/vendor/local/share/doc/groonga/ja/html/characteristic.html +19 -17
 - data/vendor/local/share/doc/groonga/ja/html/client.html +19 -17
 - data/vendor/local/share/doc/groonga/ja/html/community.html +19 -17
 - data/vendor/local/share/doc/groonga/ja/html/contribution.html +77 -69
 - data/vendor/local/share/doc/groonga/ja/html/contribution/development.html +30 -27
 - data/vendor/local/share/doc/groonga/ja/html/contribution/development/build.html +144 -0
 - data/vendor/local/share/doc/groonga/ja/html/contribution/development/build/unix_autotools.html +226 -0
 - data/vendor/local/share/doc/groonga/ja/html/contribution/development/build/unix_cmake.html +215 -0
 - data/vendor/local/share/doc/groonga/ja/html/contribution/development/build/windows_cmake.html +229 -0
 - data/vendor/local/share/doc/groonga/ja/html/contribution/development/com.html +36 -34
 - data/vendor/local/share/doc/groonga/ja/html/contribution/development/cooperation.html +53 -51
 - data/vendor/local/share/doc/groonga/ja/html/contribution/development/query.html +79 -77
 - data/vendor/local/share/doc/groonga/ja/html/contribution/development/release.html +134 -121
 - data/vendor/local/share/doc/groonga/ja/html/contribution/development/repository.html +29 -27
 - data/vendor/local/share/doc/groonga/ja/html/contribution/development/test.html +57 -53
 - data/vendor/local/share/doc/groonga/ja/html/contribution/documentation.html +21 -19
 - data/vendor/local/share/doc/groonga/ja/html/contribution/documentation/c-api.html +23 -21
 - data/vendor/local/share/doc/groonga/ja/html/contribution/documentation/i18n.html +23 -21
 - data/vendor/local/share/doc/groonga/ja/html/contribution/documentation/introduction.html +30 -28
 - data/vendor/local/share/doc/groonga/ja/html/contribution/report.html +21 -19
 - data/vendor/local/share/doc/groonga/ja/html/development.html +19 -17
 - data/vendor/local/share/doc/groonga/ja/html/development/travis-ci.html +25 -23
 - data/vendor/local/share/doc/groonga/ja/html/genindex.html +48 -20
 - data/vendor/local/share/doc/groonga/ja/html/index.html +122 -104
 - data/vendor/local/share/doc/groonga/ja/html/install.html +33 -31
 - data/vendor/local/share/doc/groonga/ja/html/install/centos.html +35 -33
 - data/vendor/local/share/doc/groonga/ja/html/install/debian.html +33 -31
 - data/vendor/local/share/doc/groonga/ja/html/install/fedora.html +30 -28
 - data/vendor/local/share/doc/groonga/ja/html/install/mac_os_x.html +26 -24
 - data/vendor/local/share/doc/groonga/ja/html/install/others.html +83 -81
 - data/vendor/local/share/doc/groonga/ja/html/install/solaris.html +25 -23
 - data/vendor/local/share/doc/groonga/ja/html/install/ubuntu.html +30 -29
 - data/vendor/local/share/doc/groonga/ja/html/install/windows.html +33 -31
 - data/vendor/local/share/doc/groonga/ja/html/limitations.html +19 -17
 - data/vendor/local/share/doc/groonga/ja/html/news.html +460 -126
 - data/vendor/local/share/doc/groonga/ja/html/news/0.x.html +19 -17
 - data/vendor/local/share/doc/groonga/ja/html/news/1.0.x.html +20 -18
 - data/vendor/local/share/doc/groonga/ja/html/news/1.1.x.html +19 -17
 - data/vendor/local/share/doc/groonga/ja/html/news/1.2.x.html +34 -32
 - data/vendor/local/share/doc/groonga/ja/html/news/1.3.x.html +29 -27
 - data/vendor/local/share/doc/groonga/ja/html/news/2.x.html +102 -100
 - data/vendor/local/share/doc/groonga/ja/html/news/3.x.html +66 -64
 - data/vendor/local/share/doc/groonga/ja/html/news/4.x.html +91 -89
 - data/vendor/local/share/doc/groonga/ja/html/news/senna.html +19 -17
 - data/vendor/local/share/doc/groonga/ja/html/objects.inv +0 -0
 - data/vendor/local/share/doc/groonga/ja/html/reference.html +111 -94
 - data/vendor/local/share/doc/groonga/ja/html/reference/api.html +55 -52
 - data/vendor/local/share/doc/groonga/ja/html/reference/api/global_configurations.html +51 -49
 - data/vendor/local/share/doc/groonga/ja/html/reference/api/grn_cache.html +58 -56
 - data/vendor/local/share/doc/groonga/ja/html/reference/api/grn_column.html +84 -82
 - data/vendor/local/share/doc/groonga/ja/html/reference/api/grn_command_version.html +46 -44
 - data/vendor/local/share/doc/groonga/ja/html/reference/api/grn_content_type.html +41 -39
 - data/vendor/local/share/doc/groonga/ja/html/reference/api/grn_ctx.html +85 -83
 - data/vendor/local/share/doc/groonga/ja/html/reference/api/grn_db.html +88 -50
 - data/vendor/local/share/doc/groonga/ja/html/reference/api/grn_encoding.html +48 -46
 - data/vendor/local/share/doc/groonga/ja/html/reference/api/grn_expr.html +78 -76
 - data/vendor/local/share/doc/groonga/ja/html/reference/api/grn_geo.html +46 -44
 - data/vendor/local/share/doc/groonga/ja/html/reference/api/grn_hook.html +48 -46
 - data/vendor/local/share/doc/groonga/ja/html/reference/api/grn_ii.html +46 -44
 - data/vendor/local/share/doc/groonga/ja/html/reference/api/grn_index_cursor.html +45 -43
 - data/vendor/local/share/doc/groonga/ja/html/reference/api/grn_info.html +45 -43
 - data/vendor/local/share/doc/groonga/ja/html/reference/api/grn_match_escalation.html +44 -42
 - data/vendor/local/share/doc/groonga/ja/html/reference/api/grn_obj.html +93 -91
 - data/vendor/local/share/doc/groonga/ja/html/reference/api/grn_proc.html +48 -46
 - data/vendor/local/share/doc/groonga/ja/html/reference/api/grn_search.html +43 -41
 - data/vendor/local/share/doc/groonga/ja/html/reference/api/grn_table.html +79 -77
 - data/vendor/local/share/doc/groonga/ja/html/reference/api/grn_table_cursor.html +69 -67
 - data/vendor/local/share/doc/groonga/ja/html/reference/api/grn_thread.html +286 -0
 - data/vendor/local/share/doc/groonga/ja/html/reference/api/grn_type.html +45 -43
 - data/vendor/local/share/doc/groonga/ja/html/reference/api/grn_user_data.html +38 -36
 - data/vendor/local/share/doc/groonga/ja/html/reference/api/overview.html +48 -46
 - data/vendor/local/share/doc/groonga/ja/html/reference/api/plugin.html +60 -58
 - data/vendor/local/share/doc/groonga/ja/html/reference/cast.html +19 -17
 - data/vendor/local/share/doc/groonga/ja/html/reference/column.html +21 -19
 - data/vendor/local/share/doc/groonga/ja/html/reference/columns/index.html +23 -21
 - data/vendor/local/share/doc/groonga/ja/html/reference/columns/pseudo.html +28 -26
 - data/vendor/local/share/doc/groonga/ja/html/reference/columns/scalar.html +23 -21
 - data/vendor/local/share/doc/groonga/ja/html/reference/columns/vector.html +61 -59
 - data/vendor/local/share/doc/groonga/ja/html/reference/command.html +70 -61
 - data/vendor/local/share/doc/groonga/ja/html/reference/command/command_version.html +23 -21
 - data/vendor/local/share/doc/groonga/ja/html/reference/command/output_format.html +49 -47
 - data/vendor/local/share/doc/groonga/ja/html/reference/command/request_id.html +28 -26
 - data/vendor/local/share/doc/groonga/ja/html/reference/command/return_code.html +99 -97
 - data/vendor/local/share/doc/groonga/ja/html/reference/commands/cache_limit.html +41 -39
 - data/vendor/local/share/doc/groonga/ja/html/reference/commands/check.html +49 -47
 - data/vendor/local/share/doc/groonga/ja/html/reference/commands/clearlock.html +33 -31
 - data/vendor/local/share/doc/groonga/ja/html/reference/commands/column_copy.html +781 -0
 - data/vendor/local/share/doc/groonga/ja/html/reference/commands/column_create.html +62 -60
 - data/vendor/local/share/doc/groonga/ja/html/reference/commands/column_list.html +88 -86
 - data/vendor/local/share/doc/groonga/ja/html/reference/commands/column_remove.html +46 -44
 - data/vendor/local/share/doc/groonga/ja/html/reference/commands/column_rename.html +66 -63
 - data/vendor/local/share/doc/groonga/ja/html/reference/commands/database_unmap.html +229 -0
 - data/vendor/local/share/doc/groonga/ja/html/reference/commands/define_selector.html +62 -60
 - data/vendor/local/share/doc/groonga/ja/html/reference/commands/defrag.html +46 -44
 - data/vendor/local/share/doc/groonga/ja/html/reference/commands/delete.html +58 -56
 - data/vendor/local/share/doc/groonga/ja/html/reference/commands/dump.html +63 -61
 - data/vendor/local/share/doc/groonga/ja/html/reference/commands/io_flush.html +266 -54
 - data/vendor/local/share/doc/groonga/ja/html/reference/commands/load.html +59 -57
 - data/vendor/local/share/doc/groonga/ja/html/reference/commands/lock_clear.html +53 -52
 - data/vendor/local/share/doc/groonga/ja/html/reference/commands/log_level.html +48 -46
 - data/vendor/local/share/doc/groonga/ja/html/reference/commands/log_put.html +48 -46
 - data/vendor/local/share/doc/groonga/ja/html/reference/commands/log_reopen.html +48 -46
 - data/vendor/local/share/doc/groonga/ja/html/reference/commands/logical_count.html +78 -75
 - data/vendor/local/share/doc/groonga/ja/html/reference/commands/logical_parameters.html +276 -0
 - data/vendor/local/share/doc/groonga/ja/html/reference/commands/logical_range_filter.html +158 -85
 - data/vendor/local/share/doc/groonga/ja/html/reference/commands/logical_select.html +2008 -80
 - data/vendor/local/share/doc/groonga/ja/html/reference/commands/logical_shard_list.html +285 -0
 - data/vendor/local/share/doc/groonga/ja/html/reference/commands/logical_table_remove.html +71 -68
 - data/vendor/local/share/doc/groonga/ja/html/reference/commands/normalize.html +82 -79
 - data/vendor/local/share/doc/groonga/ja/html/reference/commands/normalizer_list.html +52 -50
 - data/vendor/local/share/doc/groonga/ja/html/reference/commands/object_exist.html +220 -0
 - data/vendor/local/share/doc/groonga/ja/html/reference/commands/plugin_register.html +53 -51
 - data/vendor/local/share/doc/groonga/ja/html/reference/commands/plugin_unregister.html +51 -49
 - data/vendor/local/share/doc/groonga/ja/html/reference/commands/quit.html +43 -41
 - data/vendor/local/share/doc/groonga/ja/html/reference/commands/range_filter.html +43 -41
 - data/vendor/local/share/doc/groonga/ja/html/reference/commands/register.html +52 -50
 - data/vendor/local/share/doc/groonga/ja/html/reference/commands/request_cancel.html +68 -67
 - data/vendor/local/share/doc/groonga/ja/html/reference/commands/ruby_eval.html +59 -57
 - data/vendor/local/share/doc/groonga/ja/html/reference/commands/ruby_load.html +59 -57
 - data/vendor/local/share/doc/groonga/ja/html/reference/commands/select.html +680 -448
 - data/vendor/local/share/doc/groonga/ja/html/reference/commands/shutdown.html +43 -41
 - data/vendor/local/share/doc/groonga/ja/html/reference/commands/status.html +48 -46
 - data/vendor/local/share/doc/groonga/ja/html/reference/commands/suggest.html +84 -82
 - data/vendor/local/share/doc/groonga/ja/html/reference/commands/table_create.html +146 -126
 - data/vendor/local/share/doc/groonga/ja/html/reference/commands/table_list.html +55 -53
 - data/vendor/local/share/doc/groonga/ja/html/reference/commands/table_remove.html +46 -44
 - data/vendor/local/share/doc/groonga/ja/html/reference/commands/table_rename.html +322 -0
 - data/vendor/local/share/doc/groonga/ja/html/reference/commands/table_tokenize.html +73 -70
 - data/vendor/local/share/doc/groonga/ja/html/reference/commands/thread_limit.html +229 -0
 - data/vendor/local/share/doc/groonga/ja/html/reference/commands/tokenize.html +94 -91
 - data/vendor/local/share/doc/groonga/ja/html/reference/commands/tokenizer_list.html +56 -51
 - data/vendor/local/share/doc/groonga/ja/html/reference/commands/truncate.html +56 -55
 - data/vendor/local/share/doc/groonga/ja/html/reference/executables.html +24 -22
 - data/vendor/local/share/doc/groonga/ja/html/reference/executables/grndb.html +39 -37
 - data/vendor/local/share/doc/groonga/ja/html/reference/executables/grnslap.html +28 -26
 - data/vendor/local/share/doc/groonga/ja/html/reference/executables/groonga-benchmark.html +33 -31
 - data/vendor/local/share/doc/groonga/ja/html/reference/executables/groonga-httpd.html +73 -72
 - data/vendor/local/share/doc/groonga/ja/html/reference/executables/groonga-server-http.html +29 -27
 - data/vendor/local/share/doc/groonga/ja/html/reference/executables/groonga-suggest-create-dataset.html +23 -21
 - data/vendor/local/share/doc/groonga/ja/html/reference/executables/groonga-suggest-httpd.html +48 -46
 - data/vendor/local/share/doc/groonga/ja/html/reference/executables/groonga-suggest-learner.html +40 -38
 - data/vendor/local/share/doc/groonga/ja/html/reference/executables/groonga.html +84 -82
 - data/vendor/local/share/doc/groonga/ja/html/reference/function.html +22 -20
 - data/vendor/local/share/doc/groonga/ja/html/reference/functions/between.html +46 -44
 - data/vendor/local/share/doc/groonga/ja/html/reference/functions/edit_distance.html +25 -23
 - data/vendor/local/share/doc/groonga/ja/html/reference/functions/geo_distance.html +69 -67
 - data/vendor/local/share/doc/groonga/ja/html/reference/functions/geo_in_circle.html +36 -34
 - data/vendor/local/share/doc/groonga/ja/html/reference/functions/geo_in_rectangle.html +26 -24
 - data/vendor/local/share/doc/groonga/ja/html/reference/functions/highlight_full.html +56 -54
 - data/vendor/local/share/doc/groonga/ja/html/reference/functions/highlight_html.html +45 -43
 - data/vendor/local/share/doc/groonga/ja/html/reference/functions/html_untag.html +34 -32
 - data/vendor/local/share/doc/groonga/ja/html/reference/functions/in_values.html +35 -33
 - data/vendor/local/share/doc/groonga/ja/html/reference/functions/now.html +23 -21
 - data/vendor/local/share/doc/groonga/ja/html/reference/functions/query.html +48 -46
 - data/vendor/local/share/doc/groonga/ja/html/reference/functions/rand.html +24 -22
 - data/vendor/local/share/doc/groonga/ja/html/reference/functions/snippet_html.html +35 -33
 - data/vendor/local/share/doc/groonga/ja/html/reference/functions/sub_filter.html +37 -35
 - data/vendor/local/share/doc/groonga/ja/html/reference/functions/vector_size.html +33 -31
 - data/vendor/local/share/doc/groonga/ja/html/reference/grn_expr.html +26 -24
 - data/vendor/local/share/doc/groonga/ja/html/reference/grn_expr/query_syntax.html +143 -141
 - data/vendor/local/share/doc/groonga/ja/html/reference/grn_expr/script_syntax.html +236 -234
 - data/vendor/local/share/doc/groonga/ja/html/reference/indexing.html +34 -28
 - data/vendor/local/share/doc/groonga/ja/html/reference/log.html +43 -41
 - data/vendor/local/share/doc/groonga/ja/html/reference/normalizers.html +39 -37
 - data/vendor/local/share/doc/groonga/ja/html/reference/operations.html +21 -19
 - data/vendor/local/share/doc/groonga/ja/html/reference/operations/geolocation_search.html +23 -21
 - data/vendor/local/share/doc/groonga/ja/html/reference/output.html +32 -30
 - data/vendor/local/share/doc/groonga/ja/html/reference/query_expanders.html +21 -19
 - data/vendor/local/share/doc/groonga/ja/html/reference/query_expanders/tsv.html +42 -39
 - data/vendor/local/share/doc/groonga/ja/html/reference/regular_expression.html +91 -88
 - data/vendor/local/share/doc/groonga/ja/html/reference/scorer.html +50 -40
 - data/vendor/local/share/doc/groonga/ja/html/reference/scorers/scorer_tf_at_most.html +27 -25
 - data/vendor/local/share/doc/groonga/ja/html/reference/scorers/scorer_tf_idf.html +41 -31
 - data/vendor/local/share/doc/groonga/ja/html/reference/scoring_note.html +19 -17
 - data/vendor/local/share/doc/groonga/ja/html/reference/sharding.html +223 -0
 - data/vendor/local/share/doc/groonga/ja/html/reference/suggest.html +21 -19
 - data/vendor/local/share/doc/groonga/ja/html/reference/suggest/completion.html +27 -25
 - data/vendor/local/share/doc/groonga/ja/html/reference/suggest/correction.html +23 -21
 - data/vendor/local/share/doc/groonga/ja/html/reference/suggest/introduction.html +23 -21
 - data/vendor/local/share/doc/groonga/ja/html/reference/suggest/suggestion.html +23 -21
 - data/vendor/local/share/doc/groonga/ja/html/reference/tables.html +46 -44
 - data/vendor/local/share/doc/groonga/ja/html/reference/token_filters.html +38 -36
 - data/vendor/local/share/doc/groonga/ja/html/reference/tokenizers.html +303 -243
 - data/vendor/local/share/doc/groonga/ja/html/reference/tuning.html +62 -60
 - data/vendor/local/share/doc/groonga/ja/html/reference/types.html +42 -40
 - data/vendor/local/share/doc/groonga/ja/html/search.html +11 -11
 - data/vendor/local/share/doc/groonga/ja/html/searchindex.js +1 -1
 - data/vendor/local/share/doc/groonga/ja/html/server.html +19 -17
 - data/vendor/local/share/doc/groonga/ja/html/server/gqtp.html +21 -19
 - data/vendor/local/share/doc/groonga/ja/html/server/http.html +21 -19
 - data/vendor/local/share/doc/groonga/ja/html/server/http/comparison.html +31 -29
 - data/vendor/local/share/doc/groonga/ja/html/server/http/groonga-httpd.html +23 -21
 - data/vendor/local/share/doc/groonga/ja/html/server/http/groonga.html +23 -21
 - data/vendor/local/share/doc/groonga/ja/html/server/memcached.html +21 -19
 - data/vendor/local/share/doc/groonga/ja/html/server/package.html +38 -36
 - data/vendor/local/share/doc/groonga/ja/html/spec.html +23 -21
 - data/vendor/local/share/doc/groonga/ja/html/spec/gqtp.html +128 -126
 - data/vendor/local/share/doc/groonga/ja/html/spec/search.html +22 -20
 - data/vendor/local/share/doc/groonga/ja/html/troubleshooting.html +19 -17
 - data/vendor/local/share/doc/groonga/ja/html/troubleshooting/different_results_with_the_same_keyword.html +25 -23
 - data/vendor/local/share/doc/groonga/ja/html/troubleshooting/mmap_cannot_allocate_memory.html +22 -20
 - data/vendor/local/share/doc/groonga/ja/html/tutorial.html +20 -18
 - data/vendor/local/share/doc/groonga/ja/html/tutorial/data.html +29 -23
 - data/vendor/local/share/doc/groonga/ja/html/tutorial/drilldown.html +34 -32
 - data/vendor/local/share/doc/groonga/ja/html/tutorial/index.html +22 -20
 - data/vendor/local/share/doc/groonga/ja/html/tutorial/introduction.html +46 -39
 - data/vendor/local/share/doc/groonga/ja/html/tutorial/lexicon.html +21 -19
 - data/vendor/local/share/doc/groonga/ja/html/tutorial/match_columns.html +66 -64
 - data/vendor/local/share/doc/groonga/ja/html/tutorial/micro_blog.html +52 -50
 - data/vendor/local/share/doc/groonga/ja/html/tutorial/network.html +25 -23
 - data/vendor/local/share/doc/groonga/ja/html/tutorial/patricia_trie.html +22 -20
 - data/vendor/local/share/doc/groonga/ja/html/tutorial/query_expansion.html +25 -23
 - data/vendor/local/share/doc/groonga/ja/html/tutorial/search.html +35 -33
 - data/vendor/local/share/license/groonga/README.md +6 -0
 - data/vendor/local/share/license/mruby/AUTHORS +1 -0
 - data/vendor/local/share/license/mruby/MITL +1 -1
 - data/vendor/local/share/license/mruby/README.md +6 -5
 - data/vendor/local/share/license/msgpack/README +219 -0
 - data/vendor/local/share/man/ja/man1/groonga.1 +23512 -15126
 - data/vendor/local/share/man/man1/groonga.1 +26542 -17745
 - metadata +77 -3
 - data/vendor/local/share/license/msgpack/AUTHORS +0 -0
 
| 
         @@ -7,7 +7,7 @@ 
     | 
|
| 
       7 
7 
     | 
    
         
             
              <head>
         
     | 
| 
       8 
8 
     | 
    
         
             
                <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
         
     | 
| 
       9 
9 
     | 
    
         | 
| 
       10 
     | 
    
         
            -
                <title>7.9. Token filters — Groonga v5.0. 
     | 
| 
      
 10 
     | 
    
         
            +
                <title>7.9. Token filters — Groonga v5.0.6-226-gd7da7e7 documentation</title>
         
     | 
| 
       11 
11 
     | 
    
         | 
| 
       12 
12 
     | 
    
         
             
                <link rel="stylesheet" href="../_static/groonga.css" type="text/css" />
         
     | 
| 
       13 
13 
     | 
    
         
             
                <link rel="stylesheet" href="../_static/pygments.css" type="text/css" />
         
     | 
| 
         @@ -15,7 +15,7 @@ 
     | 
|
| 
       15 
15 
     | 
    
         
             
                <script type="text/javascript">
         
     | 
| 
       16 
16 
     | 
    
         
             
                  var DOCUMENTATION_OPTIONS = {
         
     | 
| 
       17 
17 
     | 
    
         
             
                    URL_ROOT:    '../',
         
     | 
| 
       18 
     | 
    
         
            -
                    VERSION:     '5.0. 
     | 
| 
      
 18 
     | 
    
         
            +
                    VERSION:     '5.0.6-226-gd7da7e7',
         
     | 
| 
       19 
19 
     | 
    
         
             
                    COLLAPSE_INDEX: false,
         
     | 
| 
       20 
20 
     | 
    
         
             
                    FILE_SUFFIX: '.html',
         
     | 
| 
       21 
21 
     | 
    
         
             
                    HAS_SOURCE:  true
         
     | 
| 
         @@ -25,12 +25,12 @@ 
     | 
|
| 
       25 
25 
     | 
    
         
             
                <script type="text/javascript" src="../_static/underscore.js"></script>
         
     | 
| 
       26 
26 
     | 
    
         
             
                <script type="text/javascript" src="../_static/doctools.js"></script>
         
     | 
| 
       27 
27 
     | 
    
         
             
                <link rel="shortcut icon" href="../_static/favicon.ico"/>
         
     | 
| 
       28 
     | 
    
         
            -
                <link rel="top" title="Groonga v5.0. 
     | 
| 
      
 28 
     | 
    
         
            +
                <link rel="top" title="Groonga v5.0.6-226-gd7da7e7 documentation" href="../index.html" />
         
     | 
| 
       29 
29 
     | 
    
         
             
                <link rel="up" title="7. Reference manual" href="../reference.html" />
         
     | 
| 
       30 
30 
     | 
    
         
             
                <link rel="next" title="7.10. Query expanders" href="query_expanders.html" />
         
     | 
| 
       31 
31 
     | 
    
         
             
                <link rel="prev" title="7.8. Tokenizers" href="tokenizers.html" /> 
         
     | 
| 
       32 
32 
     | 
    
         
             
              </head>
         
     | 
| 
       33 
     | 
    
         
            -
              <body>
         
     | 
| 
      
 33 
     | 
    
         
            +
              <body role="document">
         
     | 
| 
       34 
34 
     | 
    
         
             
            <div class="header">
         
     | 
| 
       35 
35 
     | 
    
         
             
              <h1 class="title">
         
     | 
| 
       36 
36 
     | 
    
         
             
                <a id="top-link" href="../index.html">
         
     | 
| 
         @@ -48,7 +48,7 @@ 
     | 
|
| 
       48 
48 
     | 
    
         
             
            </div>
         
     | 
| 
       49 
49 
     | 
    
         | 
| 
       50 
50 
     | 
    
         | 
| 
       51 
     | 
    
         
            -
                <div class="related">
         
     | 
| 
      
 51 
     | 
    
         
            +
                <div class="related" role="navigation" aria-label="related navigation">
         
     | 
| 
       52 
52 
     | 
    
         
             
                  <h3>Navigation</h3>
         
     | 
| 
       53 
53 
     | 
    
         
             
                  <ul>
         
     | 
| 
       54 
54 
     | 
    
         
             
                    <li class="right" style="margin-right: 10px">
         
     | 
| 
         @@ -60,15 +60,15 @@ 
     | 
|
| 
       60 
60 
     | 
    
         
             
                    <li class="right" >
         
     | 
| 
       61 
61 
     | 
    
         
             
                      <a href="tokenizers.html" title="7.8. Tokenizers"
         
     | 
| 
       62 
62 
     | 
    
         
             
                         accesskey="P">previous</a> |</li>
         
     | 
| 
       63 
     | 
    
         
            -
                    <li><a href="../index.html">Groonga v5.0. 
     | 
| 
       64 
     | 
    
         
            -
                      <li><a href="../reference.html" accesskey="U">7. Reference manual</a> »</li> 
         
     | 
| 
      
 63 
     | 
    
         
            +
                    <li class="nav-item nav-item-0"><a href="../index.html">Groonga v5.0.6-226-gd7da7e7 documentation</a> »</li>
         
     | 
| 
      
 64 
     | 
    
         
            +
                      <li class="nav-item nav-item-1"><a href="../reference.html" accesskey="U">7. Reference manual</a> »</li> 
         
     | 
| 
       65 
65 
     | 
    
         
             
                  </ul>
         
     | 
| 
       66 
66 
     | 
    
         
             
                </div>  
         
     | 
| 
       67 
67 
     | 
    
         | 
| 
       68 
68 
     | 
    
         
             
                <div class="document">
         
     | 
| 
       69 
69 
     | 
    
         
             
                  <div class="documentwrapper">
         
     | 
| 
       70 
70 
     | 
    
         
             
                    <div class="bodywrapper">
         
     | 
| 
       71 
     | 
    
         
            -
                      <div class="body">
         
     | 
| 
      
 71 
     | 
    
         
            +
                      <div class="body" role="main">
         
     | 
| 
       72 
72 
     | 
    
         | 
| 
       73 
73 
     | 
    
         
             
              <div class="section" id="token-filters">
         
     | 
| 
       74 
74 
     | 
    
         
             
            <h1>7.9. Token filters<a class="headerlink" href="#token-filters" title="Permalink to this headline">¶</a></h1>
         
     | 
| 
         @@ -78,9 +78,9 @@ 
     | 
|
| 
       78 
78 
     | 
    
         
             
            <p>Token filter module can be added as a plugin.</p>
         
     | 
| 
       79 
79 
     | 
    
         
             
            <p>You can customize tokenized token by registering your token filters plugins to Groonga.</p>
         
     | 
| 
       80 
80 
     | 
    
         
             
            <p>A table can have zero or more token filters. You can attach token
         
     | 
| 
       81 
     | 
    
         
            -
            filters to a table by <a class="reference internal" href="commands/table_create.html#table-create-token-filters">< 
     | 
| 
      
 81 
     | 
    
         
            +
            filters to a table by <a class="reference internal" href="commands/table_create.html#table-create-token-filters"><span>token_filters</span></a> option in
         
     | 
| 
       82 
82 
     | 
    
         
             
            <a class="reference internal" href="commands/table_create.html"><em>table_create</em></a>.</p>
         
     | 
| 
       83 
     | 
    
         
            -
            <p>Here is an example < 
     | 
| 
      
 83 
     | 
    
         
            +
            <p>Here is an example <code class="docutils literal"><span class="pre">table_create</span></code> that uses <code class="docutils literal"><span class="pre">TokenFilterStopWord</span></code>
         
     | 
| 
       84 
84 
     | 
    
         
             
            token filter module:</p>
         
     | 
| 
       85 
85 
     | 
    
         
             
            <p>Execution example:</p>
         
     | 
| 
       86 
86 
     | 
    
         
             
            <div class="highlight-none"><div class="highlight"><pre>register token_filters/stop_word
         
     | 
| 
         @@ -97,17 +97,17 @@ table_create Terms TABLE_PAT_KEY ShortText \ 
     | 
|
| 
       97 
97 
     | 
    
         
             
            <h2>7.9.2. Available token filters<a class="headerlink" href="#available-token-filters" title="Permalink to this headline">¶</a></h2>
         
     | 
| 
       98 
98 
     | 
    
         
             
            <p>Here is the list of available token filters:</p>
         
     | 
| 
       99 
99 
     | 
    
         
             
            <ul class="simple">
         
     | 
| 
       100 
     | 
    
         
            -
            <li>< 
     | 
| 
       101 
     | 
    
         
            -
            <li>< 
     | 
| 
      
 100 
     | 
    
         
            +
            <li><code class="docutils literal"><span class="pre">TokenFilterStopWord</span></code></li>
         
     | 
| 
      
 101 
     | 
    
         
            +
            <li><code class="docutils literal"><span class="pre">TokenFilterStem</span></code></li>
         
     | 
| 
       102 
102 
     | 
    
         
             
            </ul>
         
     | 
| 
       103 
103 
     | 
    
         
             
            <div class="section" id="tokenfilterstopword">
         
     | 
| 
       104 
     | 
    
         
            -
            <span id="token-filter-stop-word"></span><h3>7.9.2.1. < 
     | 
| 
       105 
     | 
    
         
            -
            <p>< 
     | 
| 
      
 104 
     | 
    
         
            +
            <span id="token-filter-stop-word"></span><h3>7.9.2.1. <code class="docutils literal"><span class="pre">TokenFilterStopWord</span></code><a class="headerlink" href="#tokenfilterstopword" title="Permalink to this headline">¶</a></h3>
         
     | 
| 
      
 105 
     | 
    
         
            +
            <p><code class="docutils literal"><span class="pre">TokenFilterStopWord</span></code> removes stop words from tokenized token
         
     | 
| 
       106 
106 
     | 
    
         
             
            in searching the documents.</p>
         
     | 
| 
       107 
     | 
    
         
            -
            <p>< 
     | 
| 
      
 107 
     | 
    
         
            +
            <p><code class="docutils literal"><span class="pre">TokenFilterStopWord</span></code> can specify stop word after adding the
         
     | 
| 
       108 
108 
     | 
    
         
             
            documents because it removes token in searching the documents.</p>
         
     | 
| 
       109 
     | 
    
         
            -
            <p>The stop word is specified < 
     | 
| 
       110 
     | 
    
         
            -
            <p>Here is an example that uses < 
     | 
| 
      
 109 
     | 
    
         
            +
            <p>The stop word is specified <code class="docutils literal"><span class="pre">is_stop_word</span></code> column on lexicon table.</p>
         
     | 
| 
      
 110 
     | 
    
         
            +
            <p>Here is an example that uses <code class="docutils literal"><span class="pre">TokenFilterStopWord</span></code> token filter:</p>
         
     | 
| 
       111 
111 
     | 
    
         
             
            <p>Execution example:</p>
         
     | 
| 
       112 
112 
     | 
    
         
             
            <div class="highlight-none"><div class="highlight"><pre>register token_filters/stop_word
         
     | 
| 
       113 
113 
     | 
    
         
             
            # [[0, 1337566253.89858, 0.000355720520019531], true]
         
     | 
| 
         @@ -171,14 +171,14 @@ select Memos --match_columns content --query "Hello and" 
     | 
|
| 
       171 
171 
     | 
    
         
             
            # ]
         
     | 
| 
       172 
172 
     | 
    
         
             
            </pre></div>
         
     | 
| 
       173 
173 
     | 
    
         
             
            </div>
         
     | 
| 
       174 
     | 
    
         
            -
            <p>< 
     | 
| 
       175 
     | 
    
         
            -
            <p>< 
     | 
| 
       176 
     | 
    
         
            -
            < 
     | 
| 
      
 174 
     | 
    
         
            +
            <p><code class="docutils literal"><span class="pre">and</span></code> token is marked as stop word in <code class="docutils literal"><span class="pre">Terms</span></code> table.</p>
         
     | 
| 
      
 175 
     | 
    
         
            +
            <p><code class="docutils literal"><span class="pre">"Hello"</span></code> that doesn't have <code class="docutils literal"><span class="pre">and</span></code> in content is matched. Because
         
     | 
| 
      
 176 
     | 
    
         
            +
            <code class="docutils literal"><span class="pre">and</span></code> is a stop word and <code class="docutils literal"><span class="pre">and</span></code> is removed from query.</p>
         
     | 
| 
       177 
177 
     | 
    
         
             
            </div>
         
     | 
| 
       178 
178 
     | 
    
         
             
            <div class="section" id="tokenfilterstem">
         
     | 
| 
       179 
     | 
    
         
            -
            <span id="token-filter-stem"></span><h3>7.9.2.2. < 
     | 
| 
       180 
     | 
    
         
            -
            <p>< 
     | 
| 
       181 
     | 
    
         
            -
            <p>Here is an example that uses < 
     | 
| 
      
 179 
     | 
    
         
            +
            <span id="token-filter-stem"></span><h3>7.9.2.2. <code class="docutils literal"><span class="pre">TokenFilterStem</span></code><a class="headerlink" href="#tokenfilterstem" title="Permalink to this headline">¶</a></h3>
         
     | 
| 
      
 180 
     | 
    
         
            +
            <p><code class="docutils literal"><span class="pre">TokenFilterStem</span></code> stems tokenized token.</p>
         
     | 
| 
      
 181 
     | 
    
         
            +
            <p>Here is an example that uses <code class="docutils literal"><span class="pre">TokenFilterStem</span></code> token filter:</p>
         
     | 
| 
       182 
182 
     | 
    
         
             
            <p>Execution example:</p>
         
     | 
| 
       183 
183 
     | 
    
         
             
            <div class="highlight-none"><div class="highlight"><pre>register token_filters/stem
         
     | 
| 
       184 
184 
     | 
    
         
             
            # [[0, 1337566253.89858, 0.000355720520019531], true]
         
     | 
| 
         @@ -239,9 +239,9 @@ select Memos --match_columns content --query "develops" 
     | 
|
| 
       239 
239 
     | 
    
         
             
            # ]
         
     | 
| 
       240 
240 
     | 
    
         
             
            </pre></div>
         
     | 
| 
       241 
241 
     | 
    
         
             
            </div>
         
     | 
| 
       242 
     | 
    
         
            -
            <p>All of < 
     | 
| 
       243 
     | 
    
         
            -
            tokens are stemmed as < 
     | 
| 
       244 
     | 
    
         
            -
            < 
     | 
| 
      
 242 
     | 
    
         
            +
            <p>All of <code class="docutils literal"><span class="pre">develop</span></code>, <code class="docutils literal"><span class="pre">developing</span></code>, <code class="docutils literal"><span class="pre">developed</span></code> and <code class="docutils literal"><span class="pre">develops</span></code>
         
     | 
| 
      
 243 
     | 
    
         
            +
            tokens are stemmed as <code class="docutils literal"><span class="pre">develop</span></code>. So we can find <code class="docutils literal"><span class="pre">develop</span></code>,
         
     | 
| 
      
 244 
     | 
    
         
            +
            <code class="docutils literal"><span class="pre">developing</span></code> and <code class="docutils literal"><span class="pre">developed</span></code> by <code class="docutils literal"><span class="pre">develops</span></code> query.</p>
         
     | 
| 
       245 
245 
     | 
    
         
             
            </div>
         
     | 
| 
       246 
246 
     | 
    
         
             
            </div>
         
     | 
| 
       247 
247 
     | 
    
         
             
            <div class="section" id="see-also">
         
     | 
| 
         @@ -256,15 +256,15 @@ tokens are stemmed as <tt class="docutils literal"><span class="pre">develop</sp 
     | 
|
| 
       256 
256 
     | 
    
         
             
                      </div>
         
     | 
| 
       257 
257 
     | 
    
         
             
                    </div>
         
     | 
| 
       258 
258 
     | 
    
         
             
                  </div>
         
     | 
| 
       259 
     | 
    
         
            -
                  <div class="sphinxsidebar">
         
     | 
| 
      
 259 
     | 
    
         
            +
                  <div class="sphinxsidebar" role="navigation" aria-label="main navigation">
         
     | 
| 
       260 
260 
     | 
    
         
             
                    <div class="sphinxsidebarwrapper">
         
     | 
| 
       261 
261 
     | 
    
         
             
              <h3><a href="../index.html">Table Of Contents</a></h3>
         
     | 
| 
       262 
262 
     | 
    
         
             
              <ul>
         
     | 
| 
       263 
263 
     | 
    
         
             
            <li><a class="reference internal" href="#">7.9. Token filters</a><ul>
         
     | 
| 
       264 
264 
     | 
    
         
             
            <li><a class="reference internal" href="#summary">7.9.1. Summary</a></li>
         
     | 
| 
       265 
265 
     | 
    
         
             
            <li><a class="reference internal" href="#available-token-filters">7.9.2. Available token filters</a><ul>
         
     | 
| 
       266 
     | 
    
         
            -
            <li><a class="reference internal" href="#tokenfilterstopword">7.9.2.1. < 
     | 
| 
       267 
     | 
    
         
            -
            <li><a class="reference internal" href="#tokenfilterstem">7.9.2.2. < 
     | 
| 
      
 266 
     | 
    
         
            +
            <li><a class="reference internal" href="#tokenfilterstopword">7.9.2.1. <code class="docutils literal"><span class="pre">TokenFilterStopWord</span></code></a></li>
         
     | 
| 
      
 267 
     | 
    
         
            +
            <li><a class="reference internal" href="#tokenfilterstem">7.9.2.2. <code class="docutils literal"><span class="pre">TokenFilterStem</span></code></a></li>
         
     | 
| 
       268 
268 
     | 
    
         
             
            </ul>
         
     | 
| 
       269 
269 
     | 
    
         
             
            </li>
         
     | 
| 
       270 
270 
     | 
    
         
             
            <li><a class="reference internal" href="#see-also">7.9.3. See also</a></li>
         
     | 
| 
         @@ -278,12 +278,14 @@ tokens are stemmed as <tt class="docutils literal"><span class="pre">develop</sp 
     | 
|
| 
       278 
278 
     | 
    
         
             
              <h4>Next topic</h4>
         
     | 
| 
       279 
279 
     | 
    
         
             
              <p class="topless"><a href="query_expanders.html"
         
     | 
| 
       280 
280 
     | 
    
         
             
                                    title="next chapter">7.10. Query expanders</a></p>
         
     | 
| 
       281 
     | 
    
         
            -
              < 
     | 
| 
       282 
     | 
    
         
            -
             
     | 
| 
       283 
     | 
    
         
            -
                < 
     | 
| 
       284 
     | 
    
         
            -
             
     | 
| 
       285 
     | 
    
         
            -
             
     | 
| 
       286 
     | 
    
         
            -
             
     | 
| 
      
 281 
     | 
    
         
            +
              <div role="note" aria-label="source link">
         
     | 
| 
      
 282 
     | 
    
         
            +
                <h3>This Page</h3>
         
     | 
| 
      
 283 
     | 
    
         
            +
                <ul class="this-page-menu">
         
     | 
| 
      
 284 
     | 
    
         
            +
                  <li><a href="../_sources/reference/token_filters.txt"
         
     | 
| 
      
 285 
     | 
    
         
            +
                        rel="nofollow">Show Source</a></li>
         
     | 
| 
      
 286 
     | 
    
         
            +
                </ul>
         
     | 
| 
      
 287 
     | 
    
         
            +
               </div>
         
     | 
| 
      
 288 
     | 
    
         
            +
            <div id="searchbox" style="display: none" role="search">
         
     | 
| 
       287 
289 
     | 
    
         
             
              <h3>Quick search</h3>
         
     | 
| 
       288 
290 
     | 
    
         
             
                <form class="search" action="../search.html" method="get">
         
     | 
| 
       289 
291 
     | 
    
         
             
                  <input type="text" name="q" />
         
     | 
| 
         @@ -300,7 +302,7 @@ tokens are stemmed as <tt class="docutils literal"><span class="pre">develop</sp 
     | 
|
| 
       300 
302 
     | 
    
         
             
                  </div>
         
     | 
| 
       301 
303 
     | 
    
         
             
                  <div class="clearer"></div>
         
     | 
| 
       302 
304 
     | 
    
         
             
                </div>
         
     | 
| 
       303 
     | 
    
         
            -
                <div class="related">
         
     | 
| 
      
 305 
     | 
    
         
            +
                <div class="related" role="navigation" aria-label="related navigation">
         
     | 
| 
       304 
306 
     | 
    
         
             
                  <h3>Navigation</h3>
         
     | 
| 
       305 
307 
     | 
    
         
             
                  <ul>
         
     | 
| 
       306 
308 
     | 
    
         
             
                    <li class="right" style="margin-right: 10px">
         
     | 
| 
         @@ -312,11 +314,11 @@ tokens are stemmed as <tt class="docutils literal"><span class="pre">develop</sp 
     | 
|
| 
       312 
314 
     | 
    
         
             
                    <li class="right" >
         
     | 
| 
       313 
315 
     | 
    
         
             
                      <a href="tokenizers.html" title="7.8. Tokenizers"
         
     | 
| 
       314 
316 
     | 
    
         
             
                         >previous</a> |</li>
         
     | 
| 
       315 
     | 
    
         
            -
                    <li><a href="../index.html">Groonga v5.0. 
     | 
| 
       316 
     | 
    
         
            -
                      <li><a href="../reference.html" >7. Reference manual</a> »</li> 
         
     | 
| 
      
 317 
     | 
    
         
            +
                    <li class="nav-item nav-item-0"><a href="../index.html">Groonga v5.0.6-226-gd7da7e7 documentation</a> »</li>
         
     | 
| 
      
 318 
     | 
    
         
            +
                      <li class="nav-item nav-item-1"><a href="../reference.html" >7. Reference manual</a> »</li> 
         
     | 
| 
       317 
319 
     | 
    
         
             
                  </ul>
         
     | 
| 
       318 
320 
     | 
    
         
             
                </div>
         
     | 
| 
       319 
     | 
    
         
            -
                <div class="footer">
         
     | 
| 
      
 321 
     | 
    
         
            +
                <div class="footer" role="contentinfo">
         
     | 
| 
       320 
322 
     | 
    
         
             
                    © Copyright 2009-2015, Brazil, Inc.
         
     | 
| 
       321 
323 
     | 
    
         
             
                </div>
         
     | 
| 
       322 
324 
     | 
    
         
             
              </body>
         
     | 
| 
         @@ -7,7 +7,7 @@ 
     | 
|
| 
       7 
7 
     | 
    
         
             
              <head>
         
     | 
| 
       8 
8 
     | 
    
         
             
                <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
         
     | 
| 
       9 
9 
     | 
    
         | 
| 
       10 
     | 
    
         
            -
                <title>7.8. Tokenizers — Groonga v5.0. 
     | 
| 
      
 10 
     | 
    
         
            +
                <title>7.8. Tokenizers — Groonga v5.0.6-226-gd7da7e7 documentation</title>
         
     | 
| 
       11 
11 
     | 
    
         | 
| 
       12 
12 
     | 
    
         
             
                <link rel="stylesheet" href="../_static/groonga.css" type="text/css" />
         
     | 
| 
       13 
13 
     | 
    
         
             
                <link rel="stylesheet" href="../_static/pygments.css" type="text/css" />
         
     | 
| 
         @@ -15,7 +15,7 @@ 
     | 
|
| 
       15 
15 
     | 
    
         
             
                <script type="text/javascript">
         
     | 
| 
       16 
16 
     | 
    
         
             
                  var DOCUMENTATION_OPTIONS = {
         
     | 
| 
       17 
17 
     | 
    
         
             
                    URL_ROOT:    '../',
         
     | 
| 
       18 
     | 
    
         
            -
                    VERSION:     '5.0. 
     | 
| 
      
 18 
     | 
    
         
            +
                    VERSION:     '5.0.6-226-gd7da7e7',
         
     | 
| 
       19 
19 
     | 
    
         
             
                    COLLAPSE_INDEX: false,
         
     | 
| 
       20 
20 
     | 
    
         
             
                    FILE_SUFFIX: '.html',
         
     | 
| 
       21 
21 
     | 
    
         
             
                    HAS_SOURCE:  true
         
     | 
| 
         @@ -25,12 +25,12 @@ 
     | 
|
| 
       25 
25 
     | 
    
         
             
                <script type="text/javascript" src="../_static/underscore.js"></script>
         
     | 
| 
       26 
26 
     | 
    
         
             
                <script type="text/javascript" src="../_static/doctools.js"></script>
         
     | 
| 
       27 
27 
     | 
    
         
             
                <link rel="shortcut icon" href="../_static/favicon.ico"/>
         
     | 
| 
       28 
     | 
    
         
            -
                <link rel="top" title="Groonga v5.0. 
     | 
| 
      
 28 
     | 
    
         
            +
                <link rel="top" title="Groonga v5.0.6-226-gd7da7e7 documentation" href="../index.html" />
         
     | 
| 
       29 
29 
     | 
    
         
             
                <link rel="up" title="7. Reference manual" href="../reference.html" />
         
     | 
| 
       30 
30 
     | 
    
         
             
                <link rel="next" title="7.9. Token filters" href="token_filters.html" />
         
     | 
| 
       31 
31 
     | 
    
         
             
                <link rel="prev" title="7.7. Normalizers" href="normalizers.html" /> 
         
     | 
| 
       32 
32 
     | 
    
         
             
              </head>
         
     | 
| 
       33 
     | 
    
         
            -
              <body>
         
     | 
| 
      
 33 
     | 
    
         
            +
              <body role="document">
         
     | 
| 
       34 
34 
     | 
    
         
             
            <div class="header">
         
     | 
| 
       35 
35 
     | 
    
         
             
              <h1 class="title">
         
     | 
| 
       36 
36 
     | 
    
         
             
                <a id="top-link" href="../index.html">
         
     | 
| 
         @@ -48,7 +48,7 @@ 
     | 
|
| 
       48 
48 
     | 
    
         
             
            </div>
         
     | 
| 
       49 
49 
     | 
    
         | 
| 
       50 
50 
     | 
    
         | 
| 
       51 
     | 
    
         
            -
                <div class="related">
         
     | 
| 
      
 51 
     | 
    
         
            +
                <div class="related" role="navigation" aria-label="related navigation">
         
     | 
| 
       52 
52 
     | 
    
         
             
                  <h3>Navigation</h3>
         
     | 
| 
       53 
53 
     | 
    
         
             
                  <ul>
         
     | 
| 
       54 
54 
     | 
    
         
             
                    <li class="right" style="margin-right: 10px">
         
     | 
| 
         @@ -60,15 +60,15 @@ 
     | 
|
| 
       60 
60 
     | 
    
         
             
                    <li class="right" >
         
     | 
| 
       61 
61 
     | 
    
         
             
                      <a href="normalizers.html" title="7.7. Normalizers"
         
     | 
| 
       62 
62 
     | 
    
         
             
                         accesskey="P">previous</a> |</li>
         
     | 
| 
       63 
     | 
    
         
            -
                    <li><a href="../index.html">Groonga v5.0. 
     | 
| 
       64 
     | 
    
         
            -
                      <li><a href="../reference.html" accesskey="U">7. Reference manual</a> »</li> 
         
     | 
| 
      
 63 
     | 
    
         
            +
                    <li class="nav-item nav-item-0"><a href="../index.html">Groonga v5.0.6-226-gd7da7e7 documentation</a> »</li>
         
     | 
| 
      
 64 
     | 
    
         
            +
                      <li class="nav-item nav-item-1"><a href="../reference.html" accesskey="U">7. Reference manual</a> »</li> 
         
     | 
| 
       65 
65 
     | 
    
         
             
                  </ul>
         
     | 
| 
       66 
66 
     | 
    
         
             
                </div>  
         
     | 
| 
       67 
67 
     | 
    
         | 
| 
       68 
68 
     | 
    
         
             
                <div class="document">
         
     | 
| 
       69 
69 
     | 
    
         
             
                  <div class="documentwrapper">
         
     | 
| 
       70 
70 
     | 
    
         
             
                    <div class="bodywrapper">
         
     | 
| 
       71 
     | 
    
         
            -
                      <div class="body">
         
     | 
| 
      
 71 
     | 
    
         
            +
                      <div class="body" role="main">
         
     | 
| 
       72 
72 
     | 
    
         | 
| 
       73 
73 
     | 
    
         
             
              <div class="section" id="tokenizers">
         
     | 
| 
       74 
74 
     | 
    
         
             
            <h1>7.8. Tokenizers<a class="headerlink" href="#tokenizers" title="Permalink to this headline">¶</a></h1>
         
     | 
| 
         @@ -79,15 +79,15 @@ the following cases:</p> 
     | 
|
| 
       79 
79 
     | 
    
         
             
            <blockquote>
         
     | 
| 
       80 
80 
     | 
    
         
             
            <div><ul>
         
     | 
| 
       81 
81 
     | 
    
         
             
            <li><p class="first">Indexing text</p>
         
     | 
| 
       82 
     | 
    
         
            -
            <div class="figure align-center">
         
     | 
| 
      
 82 
     | 
    
         
            +
            <div class="figure align-center" id="id1">
         
     | 
| 
       83 
83 
     | 
    
         
             
            <a class="reference internal image-reference" href="../_images/used-when-indexing.png"><img alt="../_images/used-when-indexing.png" src="../_images/used-when-indexing.png" style="width: 80%;" /></a>
         
     | 
| 
       84 
     | 
    
         
            -
            <p class="caption">Tokenizer is used when indexing text.</p>
         
     | 
| 
      
 84 
     | 
    
         
            +
            <p class="caption"><span class="caption-text">Tokenizer is used when indexing text.</span></p>
         
     | 
| 
       85 
85 
     | 
    
         
             
            </div>
         
     | 
| 
       86 
86 
     | 
    
         
             
            </li>
         
     | 
| 
       87 
87 
     | 
    
         
             
            <li><p class="first">Searching by query</p>
         
     | 
| 
       88 
     | 
    
         
            -
            <div class="figure align-center">
         
     | 
| 
      
 88 
     | 
    
         
            +
            <div class="figure align-center" id="id2">
         
     | 
| 
       89 
89 
     | 
    
         
             
            <a class="reference internal image-reference" href="../_images/used-when-searching.png"><img alt="../_images/used-when-searching.png" src="../_images/used-when-searching.png" style="width: 80%;" /></a>
         
     | 
| 
       90 
     | 
    
         
            -
            <p class="caption">Tokenizer is used when searching by query.</p>
         
     | 
| 
      
 90 
     | 
    
         
            +
            <p class="caption"><span class="caption-text">Tokenizer is used when searching by query.</span></p>
         
     | 
| 
       91 
91 
     | 
    
         
             
            </div>
         
     | 
| 
       92 
92 
     | 
    
         
             
            </li>
         
     | 
| 
       93 
93 
     | 
    
         
             
            </ul>
         
     | 
| 
         @@ -95,12 +95,12 @@ the following cases:</p> 
     | 
|
| 
       95 
95 
     | 
    
         
             
            <p>Tokenizer is an important module for full-text search. You can change
         
     | 
| 
       96 
96 
     | 
    
         
             
            trade-off between <a class="reference external" href="http://en.wikipedia.org/wiki/Precision_and_recall">precision and recall</a> by changing
         
     | 
| 
       97 
97 
     | 
    
         
             
            tokenizer.</p>
         
     | 
| 
       98 
     | 
    
         
            -
            <p>Normally, <a class="reference internal" href="#token-bigram">< 
     | 
| 
      
 98 
     | 
    
         
            +
            <p>Normally, <a class="reference internal" href="#token-bigram"><span>TokenBigram</span></a> is a suitable tokenizer. If you don't
         
     | 
| 
       99 
99 
     | 
    
         
             
            know much about tokenizer, it's recommended that you choose
         
     | 
| 
       100 
     | 
    
         
            -
            <a class="reference internal" href="#token-bigram">< 
     | 
| 
      
 100 
     | 
    
         
            +
            <a class="reference internal" href="#token-bigram"><span>TokenBigram</span></a>.</p>
         
     | 
| 
       101 
101 
     | 
    
         
             
            <p>You can try a tokenizer by <a class="reference internal" href="commands/tokenize.html"><em>tokenize</em></a> and
         
     | 
| 
       102 
102 
     | 
    
         
             
            <a class="reference internal" href="commands/table_tokenize.html"><em>table_tokenize</em></a>. Here is an example to
         
     | 
| 
       103 
     | 
    
         
            -
            try <a class="reference internal" href="#token-bigram">< 
     | 
| 
      
 103 
     | 
    
         
            +
            try <a class="reference internal" href="#token-bigram"><span>TokenBigram</span></a> tokenizer by
         
     | 
| 
       104 
104 
     | 
    
         
             
            <a class="reference internal" href="commands/tokenize.html"><em>tokenize</em></a>:</p>
         
     | 
| 
       105 
105 
     | 
    
         
             
            <p>Execution example:</p>
         
     | 
| 
       106 
106 
     | 
    
         
             
            <div class="highlight-none"><div class="highlight"><pre>tokenize TokenBigram "Hello World"
         
     | 
| 
         @@ -113,46 +113,57 @@ try <a class="reference internal" href="#token-bigram"><em>TokenBigram</em></a> 
     | 
|
| 
       113 
113 
     | 
    
         
             
            #   [
         
     | 
| 
       114 
114 
     | 
    
         
             
            #     {
         
     | 
| 
       115 
115 
     | 
    
         
             
            #       "position": 0,
         
     | 
| 
      
 116 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       116 
117 
     | 
    
         
             
            #       "value": "He"
         
     | 
| 
       117 
118 
     | 
    
         
             
            #     },
         
     | 
| 
       118 
119 
     | 
    
         
             
            #     {
         
     | 
| 
       119 
120 
     | 
    
         
             
            #       "position": 1,
         
     | 
| 
      
 121 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       120 
122 
     | 
    
         
             
            #       "value": "el"
         
     | 
| 
       121 
123 
     | 
    
         
             
            #     },
         
     | 
| 
       122 
124 
     | 
    
         
             
            #     {
         
     | 
| 
       123 
125 
     | 
    
         
             
            #       "position": 2,
         
     | 
| 
      
 126 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       124 
127 
     | 
    
         
             
            #       "value": "ll"
         
     | 
| 
       125 
128 
     | 
    
         
             
            #     },
         
     | 
| 
       126 
129 
     | 
    
         
             
            #     {
         
     | 
| 
       127 
130 
     | 
    
         
             
            #       "position": 3,
         
     | 
| 
      
 131 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       128 
132 
     | 
    
         
             
            #       "value": "lo"
         
     | 
| 
       129 
133 
     | 
    
         
             
            #     },
         
     | 
| 
       130 
134 
     | 
    
         
             
            #     {
         
     | 
| 
       131 
135 
     | 
    
         
             
            #       "position": 4,
         
     | 
| 
      
 136 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       132 
137 
     | 
    
         
             
            #       "value": "o "
         
     | 
| 
       133 
138 
     | 
    
         
             
            #     },
         
     | 
| 
       134 
139 
     | 
    
         
             
            #     {
         
     | 
| 
       135 
140 
     | 
    
         
             
            #       "position": 5,
         
     | 
| 
      
 141 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       136 
142 
     | 
    
         
             
            #       "value": " W"
         
     | 
| 
       137 
143 
     | 
    
         
             
            #     },
         
     | 
| 
       138 
144 
     | 
    
         
             
            #     {
         
     | 
| 
       139 
145 
     | 
    
         
             
            #       "position": 6,
         
     | 
| 
      
 146 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       140 
147 
     | 
    
         
             
            #       "value": "Wo"
         
     | 
| 
       141 
148 
     | 
    
         
             
            #     },
         
     | 
| 
       142 
149 
     | 
    
         
             
            #     {
         
     | 
| 
       143 
150 
     | 
    
         
             
            #       "position": 7,
         
     | 
| 
      
 151 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       144 
152 
     | 
    
         
             
            #       "value": "or"
         
     | 
| 
       145 
153 
     | 
    
         
             
            #     },
         
     | 
| 
       146 
154 
     | 
    
         
             
            #     {
         
     | 
| 
       147 
155 
     | 
    
         
             
            #       "position": 8,
         
     | 
| 
      
 156 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       148 
157 
     | 
    
         
             
            #       "value": "rl"
         
     | 
| 
       149 
158 
     | 
    
         
             
            #     },
         
     | 
| 
       150 
159 
     | 
    
         
             
            #     {
         
     | 
| 
       151 
160 
     | 
    
         
             
            #       "position": 9,
         
     | 
| 
      
 161 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       152 
162 
     | 
    
         
             
            #       "value": "ld"
         
     | 
| 
       153 
163 
     | 
    
         
             
            #     },
         
     | 
| 
       154 
164 
     | 
    
         
             
            #     {
         
     | 
| 
       155 
165 
     | 
    
         
             
            #       "position": 10,
         
     | 
| 
      
 166 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       156 
167 
     | 
    
         
             
            #       "value": "d"
         
     | 
| 
       157 
168 
     | 
    
         
             
            #     }
         
     | 
| 
       158 
169 
     | 
    
         
             
            #   ]
         
     | 
| 
         @@ -164,86 +175,86 @@ try <a class="reference internal" href="#token-bigram"><em>TokenBigram</em></a> 
     | 
|
| 
       164 
175 
     | 
    
         
             
            <h2>7.8.2. What is "tokenize"?<a class="headerlink" href="#what-is-tokenize" title="Permalink to this headline">¶</a></h2>
         
     | 
| 
       165 
176 
     | 
    
         
             
            <p>"tokenize" is the process that extracts zero or more tokens from a
         
     | 
| 
       166 
177 
     | 
    
         
             
            text. There are some "tokenize" methods.</p>
         
     | 
| 
       167 
     | 
    
         
            -
            <p>For example, < 
     | 
| 
      
 178 
     | 
    
         
            +
            <p>For example, <code class="docutils literal"><span class="pre">Hello</span> <span class="pre">World</span></code> is tokenized to the following tokens by
         
     | 
| 
       168 
179 
     | 
    
         
             
            bigram tokenize method:</p>
         
     | 
| 
       169 
180 
     | 
    
         
             
            <blockquote>
         
     | 
| 
       170 
181 
     | 
    
         
             
            <div><ul class="simple">
         
     | 
| 
       171 
     | 
    
         
            -
            <li>< 
     | 
| 
       172 
     | 
    
         
            -
            <li>< 
     | 
| 
       173 
     | 
    
         
            -
            <li>< 
     | 
| 
       174 
     | 
    
         
            -
            <li>< 
     | 
| 
       175 
     | 
    
         
            -
            <li>< 
     | 
| 
       176 
     | 
    
         
            -
            <li>< 
     | 
| 
       177 
     | 
    
         
            -
            <li>< 
     | 
| 
       178 
     | 
    
         
            -
            <li>< 
     | 
| 
       179 
     | 
    
         
            -
            <li>< 
     | 
| 
       180 
     | 
    
         
            -
            <li>< 
     | 
| 
      
 182 
     | 
    
         
            +
            <li><code class="docutils literal"><span class="pre">He</span></code></li>
         
     | 
| 
      
 183 
     | 
    
         
            +
            <li><code class="docutils literal"><span class="pre">el</span></code></li>
         
     | 
| 
      
 184 
     | 
    
         
            +
            <li><code class="docutils literal"><span class="pre">ll</span></code></li>
         
     | 
| 
      
 185 
     | 
    
         
            +
            <li><code class="docutils literal"><span class="pre">lo</span></code></li>
         
     | 
| 
      
 186 
     | 
    
         
            +
            <li><code class="docutils literal"><span class="pre">o_</span></code> (<code class="docutils literal"><span class="pre">_</span></code> means a white-space)</li>
         
     | 
| 
      
 187 
     | 
    
         
            +
            <li><code class="docutils literal"><span class="pre">_W</span></code> (<code class="docutils literal"><span class="pre">_</span></code> means a white-space)</li>
         
     | 
| 
      
 188 
     | 
    
         
            +
            <li><code class="docutils literal"><span class="pre">Wo</span></code></li>
         
     | 
| 
      
 189 
     | 
    
         
            +
            <li><code class="docutils literal"><span class="pre">or</span></code></li>
         
     | 
| 
      
 190 
     | 
    
         
            +
            <li><code class="docutils literal"><span class="pre">rl</span></code></li>
         
     | 
| 
      
 191 
     | 
    
         
            +
            <li><code class="docutils literal"><span class="pre">ld</span></code></li>
         
     | 
| 
       181 
192 
     | 
    
         
             
            </ul>
         
     | 
| 
       182 
193 
     | 
    
         
             
            </div></blockquote>
         
     | 
| 
       183 
     | 
    
         
            -
            <p>In the above example, 10 tokens are extracted from one text < 
     | 
| 
       184 
     | 
    
         
            -
            <span class="pre">World</span></ 
     | 
| 
       185 
     | 
    
         
            -
            <p>For example, < 
     | 
| 
      
 194 
     | 
    
         
            +
            <p>In the above example, 10 tokens are extracted from one text <code class="docutils literal"><span class="pre">Hello</span>
         
     | 
| 
      
 195 
     | 
    
         
            +
            <span class="pre">World</span></code>.</p>
         
     | 
| 
      
 196 
     | 
    
         
            +
            <p>For example, <code class="docutils literal"><span class="pre">Hello</span> <span class="pre">World</span></code> is tokenized to the following tokens by
         
     | 
| 
       186 
197 
     | 
    
         
             
            white-space-separate tokenize method:</p>
         
     | 
| 
       187 
198 
     | 
    
         
             
            <blockquote>
         
     | 
| 
       188 
199 
     | 
    
         
             
            <div><ul class="simple">
         
     | 
| 
       189 
     | 
    
         
            -
            <li>< 
     | 
| 
       190 
     | 
    
         
            -
            <li>< 
     | 
| 
      
 200 
     | 
    
         
            +
            <li><code class="docutils literal"><span class="pre">Hello</span></code></li>
         
     | 
| 
      
 201 
     | 
    
         
            +
            <li><code class="docutils literal"><span class="pre">World</span></code></li>
         
     | 
| 
       191 
202 
     | 
    
         
             
            </ul>
         
     | 
| 
       192 
203 
     | 
    
         
             
            </div></blockquote>
         
     | 
| 
       193 
     | 
    
         
            -
            <p>In the above example, 2 tokens are extracted from one text < 
     | 
| 
       194 
     | 
    
         
            -
            <span class="pre">World</span></ 
     | 
| 
      
 204 
     | 
    
         
            +
            <p>In the above example, 2 tokens are extracted from one text <code class="docutils literal"><span class="pre">Hello</span>
         
     | 
| 
      
 205 
     | 
    
         
            +
            <span class="pre">World</span></code>.</p>
         
     | 
| 
       195 
206 
     | 
    
         
             
            <p>Token is used as search key. You can find indexed documents only by
         
     | 
| 
       196 
207 
     | 
    
         
             
            tokens that are extracted by the tokenize method in use. For example, you
         
     | 
| 
       197 
     | 
    
         
            -
            can find < 
     | 
| 
       198 
     | 
    
         
            -
            can't find < 
     | 
| 
      
 208 
     | 
    
         
            +
            can find <code class="docutils literal"><span class="pre">Hello</span> <span class="pre">World</span></code> by <code class="docutils literal"><span class="pre">ll</span></code> with bigram tokenize method but you
         
     | 
| 
      
 209 
     | 
    
         
            +
            can't find <code class="docutils literal"><span class="pre">Hello</span> <span class="pre">World</span></code> by <code class="docutils literal"><span class="pre">ll</span></code> with white-space-separate tokenize
         
     | 
| 
       199 
210 
     | 
    
         
             
            method. Because white-space-separate tokenize method doesn't extract
         
     | 
| 
       200 
     | 
    
         
            -
            < 
     | 
| 
      
 211 
     | 
    
         
            +
            <code class="docutils literal"><span class="pre">ll</span></code> token. It just extracts <code class="docutils literal"><span class="pre">Hello</span></code> and <code class="docutils literal"><span class="pre">World</span></code> tokens.</p>
         
     | 
| 
       201 
212 
     | 
    
         
             
            <p>In general, tokenize method that generates small tokens increases
         
     | 
| 
       202 
213 
     | 
    
         
             
            recall but decreases precision. Tokenize method that generates large
         
     | 
| 
       203 
214 
     | 
    
         
             
            tokens increases precision but decreases recall.</p>
         
     | 
| 
       204 
     | 
    
         
            -
            <p>For example, we can find < 
     | 
| 
       205 
     | 
    
         
            -
            bigram tokenize method. < 
     | 
| 
      
 215 
     | 
    
         
            +
            <p>For example, we can find <code class="docutils literal"><span class="pre">Hello</span> <span class="pre">World</span></code> and <code class="docutils literal"><span class="pre">A</span> <span class="pre">or</span> <span class="pre">B</span></code> by <code class="docutils literal"><span class="pre">or</span></code> with
         
     | 
| 
      
 216 
     | 
    
         
            +
            bigram tokenize method. <code class="docutils literal"><span class="pre">Hello</span> <span class="pre">World</span></code> is a noise for people who
         
     | 
| 
       206 
217 
     | 
    
         
             
            want to search "logical and". It means that precision is
         
     | 
| 
       207 
218 
     | 
    
         
             
            decreased. But recall is increased.</p>
         
     | 
| 
       208 
     | 
    
         
            -
            <p>We can find only < 
     | 
| 
       209 
     | 
    
         
            -
            tokenize method. Because < 
     | 
| 
      
 219 
     | 
    
         
            +
            <p>We can find only <code class="docutils literal"><span class="pre">A</span> <span class="pre">or</span> <span class="pre">B</span></code> by <code class="docutils literal"><span class="pre">or</span></code> with white-space-separate
         
     | 
| 
      
 220 
     | 
    
         
            +
            tokenize method. Because <code class="docutils literal"><span class="pre">World</span></code> is tokenized to one token <code class="docutils literal"><span class="pre">World</span></code>
         
     | 
| 
       210 
221 
     | 
    
         
             
            with white-space-separate tokenize method. It means that precision is
         
     | 
| 
       211 
222 
     | 
    
         
             
            increased for people who want to search "logical and". But recall is
         
     | 
| 
       212 
     | 
    
         
            -
            decreased because < 
     | 
| 
      
 223 
     | 
    
         
            +
            decreased because <code class="docutils literal"><span class="pre">Hello</span> <span class="pre">World</span></code> that contains <code class="docutils literal"><span class="pre">or</span></code> isn't found.</p>
         
     | 
| 
       213 
224 
     | 
    
         
             
            </div>
         
     | 
| 
       214 
225 
     | 
    
         
             
            <div class="section" id="built-in-tokenizsers">
         
     | 
| 
       215 
226 
     | 
    
         
             
            <h2>7.8.3. Built-in tokenizers<a class="headerlink" href="#built-in-tokenizsers" title="Permalink to this headline">¶</a></h2>
         
     | 
| 
       216 
227 
     | 
    
         
             
            <p>Here is a list of built-in tokenizers:</p>
         
     | 
| 
       217 
228 
     | 
    
         
             
            <blockquote>
         
     | 
| 
       218 
229 
     | 
    
         
             
            <div><ul class="simple">
         
     | 
| 
       219 
     | 
    
         
            -
            <li>< 
     | 
| 
       220 
     | 
    
         
            -
            <li>< 
     | 
| 
       221 
     | 
    
         
            -
            <li>< 
     | 
| 
       222 
     | 
    
         
            -
            <li>< 
     | 
| 
       223 
     | 
    
         
            -
            <li>< 
     | 
| 
       224 
     | 
    
         
            -
            <li>< 
     | 
| 
       225 
     | 
    
         
            -
            <li>< 
     | 
| 
       226 
     | 
    
         
            -
            <li>< 
     | 
| 
       227 
     | 
    
         
            -
            <li>< 
     | 
| 
       228 
     | 
    
         
            -
            <li>< 
     | 
| 
       229 
     | 
    
         
            -
            <li>< 
     | 
| 
       230 
     | 
    
         
            -
            <li>< 
     | 
| 
       231 
     | 
    
         
            -
            <li>< 
     | 
| 
       232 
     | 
    
         
            -
            <li>< 
     | 
| 
      
 230 
     | 
    
         
            +
            <li><code class="docutils literal"><span class="pre">TokenBigram</span></code></li>
         
     | 
| 
      
 231 
     | 
    
         
            +
            <li><code class="docutils literal"><span class="pre">TokenBigramSplitSymbol</span></code></li>
         
     | 
| 
      
 232 
     | 
    
         
            +
            <li><code class="docutils literal"><span class="pre">TokenBigramSplitSymbolAlpha</span></code></li>
         
     | 
| 
      
 233 
     | 
    
         
            +
            <li><code class="docutils literal"><span class="pre">TokenBigramSplitSymbolAlphaDigit</span></code></li>
         
     | 
| 
      
 234 
     | 
    
         
            +
            <li><code class="docutils literal"><span class="pre">TokenBigramIgnoreBlank</span></code></li>
         
     | 
| 
      
 235 
     | 
    
         
            +
            <li><code class="docutils literal"><span class="pre">TokenBigramIgnoreBlankSplitSymbol</span></code></li>
         
     | 
| 
      
 236 
     | 
    
         
            +
            <li><code class="docutils literal"><span class="pre">TokenBigramIgnoreBlankSplitAlpha</span></code></li>
         
     | 
| 
      
 237 
     | 
    
         
            +
            <li><code class="docutils literal"><span class="pre">TokenBigramIgnoreBlankSplitAlphaDigit</span></code></li>
         
     | 
| 
      
 238 
     | 
    
         
            +
            <li><code class="docutils literal"><span class="pre">TokenUnigram</span></code></li>
         
     | 
| 
      
 239 
     | 
    
         
            +
            <li><code class="docutils literal"><span class="pre">TokenTrigram</span></code></li>
         
     | 
| 
      
 240 
     | 
    
         
            +
            <li><code class="docutils literal"><span class="pre">TokenDelimit</span></code></li>
         
     | 
| 
      
 241 
     | 
    
         
            +
            <li><code class="docutils literal"><span class="pre">TokenDelimitNull</span></code></li>
         
     | 
| 
      
 242 
     | 
    
         
            +
            <li><code class="docutils literal"><span class="pre">TokenMecab</span></code></li>
         
     | 
| 
      
 243 
     | 
    
         
            +
            <li><code class="docutils literal"><span class="pre">TokenRegexp</span></code></li>
         
     | 
| 
       233 
244 
     | 
    
         
             
            </ul>
         
     | 
| 
       234 
245 
     | 
    
         
             
            </div></blockquote>
         
     | 
| 
       235 
246 
     | 
    
         
             
            <div class="section" id="tokenbigram">
         
     | 
| 
       236 
     | 
    
         
            -
            <span id="token-bigram"></span><h3>7.8.3.1. < 
     | 
| 
       237 
     | 
    
         
            -
            <p>< 
     | 
| 
      
 247 
     | 
    
         
            +
            <span id="token-bigram"></span><h3>7.8.3.1. <code class="docutils literal"><span class="pre">TokenBigram</span></code><a class="headerlink" href="#tokenbigram" title="Permalink to this headline">¶</a></h3>
         
     | 
| 
      
 248 
     | 
    
         
            +
            <p><code class="docutils literal"><span class="pre">TokenBigram</span></code> is a bigram based tokenizer. It's recommended to use
         
     | 
| 
       238 
249 
     | 
    
         
             
            this tokenizer for most cases.</p>
         
     | 
| 
       239 
250 
     | 
    
         
             
            <p>Bigram tokenize method tokenizes a text to two adjacent characters
         
     | 
| 
       240 
     | 
    
         
            -
            tokens. For example, < 
     | 
| 
      
 251 
     | 
    
         
            +
            tokens. For example, <code class="docutils literal"><span class="pre">Hello</span></code> is tokenized to the following tokens:</p>
         
     | 
| 
       241 
252 
     | 
    
         
             
            <blockquote>
         
     | 
| 
       242 
253 
     | 
    
         
             
            <div><ul class="simple">
         
     | 
| 
       243 
     | 
    
         
            -
            <li>< 
     | 
| 
       244 
     | 
    
         
            -
            <li>< 
     | 
| 
       245 
     | 
    
         
            -
            <li>< 
     | 
| 
       246 
     | 
    
         
            -
            <li>< 
     | 
| 
      
 254 
     | 
    
         
            +
            <li><code class="docutils literal"><span class="pre">He</span></code></li>
         
     | 
| 
      
 255 
     | 
    
         
            +
            <li><code class="docutils literal"><span class="pre">el</span></code></li>
         
     | 
| 
      
 256 
     | 
    
         
            +
            <li><code class="docutils literal"><span class="pre">ll</span></code></li>
         
     | 
| 
      
 257 
     | 
    
         
            +
            <li><code class="docutils literal"><span class="pre">lo</span></code></li>
         
     | 
| 
       247 
258 
     | 
    
         
             
            </ul>
         
     | 
| 
       248 
259 
     | 
    
         
             
            </div></blockquote>
         
     | 
| 
       249 
260 
     | 
    
         
             
            <p>Bigram tokenize method is good for recall because you can find all
         
     | 
| 
         @@ -252,15 +263,15 @@ texts by query consists of two or more characters.</p> 
     | 
|
| 
       252 
263 
     | 
    
         
             
            character because one character token doesn't exist. But you can find
         
     | 
| 
       253 
264 
     | 
    
         
             
            all texts by a query consisting of one character in Groonga. Because
         
     | 
| 
       254 
265 
     | 
    
         
             
            Groonga finds tokens that start with the query by predictive search. For
         
     | 
| 
       255 
     | 
    
         
            -
            example, Groonga can find < 
     | 
| 
      
 266 
     | 
    
         
            +
            example, Groonga can find <code class="docutils literal"><span class="pre">ll</span></code> and <code class="docutils literal"><span class="pre">lo</span></code> tokens by <code class="docutils literal"><span class="pre">l</span></code> query.</p>
         
     | 
| 
       256 
267 
     | 
    
         
             
            <p>Bigram tokenize method isn't good for precision because you can find
         
     | 
| 
       257 
     | 
    
         
            -
            texts that includes query in word. For example, you can find < 
     | 
| 
       258 
     | 
    
         
            -
            by < 
     | 
| 
       259 
     | 
    
         
            -
            non-ASCII languages. < 
     | 
| 
      
 268 
     | 
    
         
            +
            texts that include the query within a word. For example, you can find <code class="docutils literal"><span class="pre">world</span></code>
         
     | 
| 
      
 269 
     | 
    
         
            +
            by <code class="docutils literal"><span class="pre">or</span></code>. This is more sensitive for ASCII only languages rather than
         
     | 
| 
      
 270 
     | 
    
         
            +
            non-ASCII languages. <code class="docutils literal"><span class="pre">TokenBigram</span></code> has solution for this problem
         
     | 
| 
       260 
271 
     | 
    
         
             
            described below.</p>
         
     | 
| 
       261 
     | 
    
         
            -
            <p>< 
     | 
| 
      
 272 
     | 
    
         
            +
            <p><code class="docutils literal"><span class="pre">TokenBigram</span></code> behavior is different when it's worked with any
         
     | 
| 
       262 
273 
     | 
    
         
             
            <a class="reference internal" href="normalizers.html"><em>Normalizers</em></a>.</p>
         
     | 
| 
       263 
     | 
    
         
            -
            <p>If no normalizer is used, < 
     | 
| 
      
 274 
     | 
    
         
            +
            <p>If no normalizer is used, <code class="docutils literal"><span class="pre">TokenBigram</span></code> uses pure bigram (all tokens
         
     | 
| 
       264 
275 
     | 
    
         
             
            except the last token have two characters) tokenize method:</p>
         
     | 
| 
       265 
276 
     | 
    
         
             
            <p>Execution example:</p>
         
     | 
| 
       266 
277 
     | 
    
         
             
            <div class="highlight-none"><div class="highlight"><pre>tokenize TokenBigram "Hello World"
         
     | 
| 
         @@ -273,54 +284,65 @@ except the last token have two characters) tokenize method:</p> 
     | 
|
| 
       273 
284 
     | 
    
         
             
            #   [
         
     | 
| 
       274 
285 
     | 
    
         
             
            #     {
         
     | 
| 
       275 
286 
     | 
    
         
             
            #       "position": 0,
         
     | 
| 
      
 287 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       276 
288 
     | 
    
         
             
            #       "value": "He"
         
     | 
| 
       277 
289 
     | 
    
         
             
            #     },
         
     | 
| 
       278 
290 
     | 
    
         
             
            #     {
         
     | 
| 
       279 
291 
     | 
    
         
             
            #       "position": 1,
         
     | 
| 
      
 292 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       280 
293 
     | 
    
         
             
            #       "value": "el"
         
     | 
| 
       281 
294 
     | 
    
         
             
            #     },
         
     | 
| 
       282 
295 
     | 
    
         
             
            #     {
         
     | 
| 
       283 
296 
     | 
    
         
             
            #       "position": 2,
         
     | 
| 
      
 297 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       284 
298 
     | 
    
         
             
            #       "value": "ll"
         
     | 
| 
       285 
299 
     | 
    
         
             
            #     },
         
     | 
| 
       286 
300 
     | 
    
         
             
            #     {
         
     | 
| 
       287 
301 
     | 
    
         
             
            #       "position": 3,
         
     | 
| 
      
 302 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       288 
303 
     | 
    
         
             
            #       "value": "lo"
         
     | 
| 
       289 
304 
     | 
    
         
             
            #     },
         
     | 
| 
       290 
305 
     | 
    
         
             
            #     {
         
     | 
| 
       291 
306 
     | 
    
         
             
            #       "position": 4,
         
     | 
| 
      
 307 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       292 
308 
     | 
    
         
             
            #       "value": "o "
         
     | 
| 
       293 
309 
     | 
    
         
             
            #     },
         
     | 
| 
       294 
310 
     | 
    
         
             
            #     {
         
     | 
| 
       295 
311 
     | 
    
         
             
            #       "position": 5,
         
     | 
| 
      
 312 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       296 
313 
     | 
    
         
             
            #       "value": " W"
         
     | 
| 
       297 
314 
     | 
    
         
             
            #     },
         
     | 
| 
       298 
315 
     | 
    
         
             
            #     {
         
     | 
| 
       299 
316 
     | 
    
         
             
            #       "position": 6,
         
     | 
| 
      
 317 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       300 
318 
     | 
    
         
             
            #       "value": "Wo"
         
     | 
| 
       301 
319 
     | 
    
         
             
            #     },
         
     | 
| 
       302 
320 
     | 
    
         
             
            #     {
         
     | 
| 
       303 
321 
     | 
    
         
             
            #       "position": 7,
         
     | 
| 
      
 322 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       304 
323 
     | 
    
         
             
            #       "value": "or"
         
     | 
| 
       305 
324 
     | 
    
         
             
            #     },
         
     | 
| 
       306 
325 
     | 
    
         
             
            #     {
         
     | 
| 
       307 
326 
     | 
    
         
             
            #       "position": 8,
         
     | 
| 
      
 327 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       308 
328 
     | 
    
         
             
            #       "value": "rl"
         
     | 
| 
       309 
329 
     | 
    
         
             
            #     },
         
     | 
| 
       310 
330 
     | 
    
         
             
            #     {
         
     | 
| 
       311 
331 
     | 
    
         
             
            #       "position": 9,
         
     | 
| 
      
 332 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       312 
333 
     | 
    
         
             
            #       "value": "ld"
         
     | 
| 
       313 
334 
     | 
    
         
             
            #     },
         
     | 
| 
       314 
335 
     | 
    
         
             
            #     {
         
     | 
| 
       315 
336 
     | 
    
         
             
            #       "position": 10,
         
     | 
| 
      
 337 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       316 
338 
     | 
    
         
             
            #       "value": "d"
         
     | 
| 
       317 
339 
     | 
    
         
             
            #     }
         
     | 
| 
       318 
340 
     | 
    
         
             
            #   ]
         
     | 
| 
       319 
341 
     | 
    
         
             
            # ]
         
     | 
| 
       320 
342 
     | 
    
         
             
            </pre></div>
         
     | 
| 
       321 
343 
     | 
    
         
             
            </div>
         
     | 
| 
       322 
     | 
    
         
            -
            <p>If normalizer is used, < 
     | 
| 
       323 
     | 
    
         
            -
            tokenize method for ASCII characters. < 
     | 
| 
      
 344 
     | 
    
         
            +
            <p>If normalizer is used, <code class="docutils literal"><span class="pre">TokenBigram</span></code> uses white-space-separate like
         
     | 
| 
      
 345 
     | 
    
         
            +
            tokenize method for ASCII characters. <code class="docutils literal"><span class="pre">TokenBigram</span></code> uses bigram
         
     | 
| 
       324 
346 
     | 
    
         
             
            tokenize method for non-ASCII characters.</p>
         
     | 
| 
       325 
347 
     | 
    
         
             
            <p>You may be confused with this combined behavior. But it's reasonable
         
     | 
| 
       326 
348 
     | 
    
         
             
            for most use cases such as English text (only ASCII characters) and
         
     | 
| 
         @@ -332,10 +354,10 @@ the case.</p> 
     | 
|
| 
       332 
354 
     | 
    
         
             
            word separator. Bigram tokenize method is suitable for the case.</p>
         
     | 
| 
       333 
355 
     | 
    
         
             
            <p>Mixed tokenize method is suitable for mixed language case.</p>
         
     | 
| 
       334 
356 
     | 
    
         
             
            <p>If you want to use bigram tokenize method for ASCII character, see
         
     | 
| 
       335 
     | 
    
         
            -
            < 
     | 
| 
       336 
     | 
    
         
            -
            <a class="reference internal" href="#token-bigram-split-symbol-alpha">< 
     | 
| 
       337 
     | 
    
         
            -
            <p>Let's confirm < 
     | 
| 
       338 
     | 
    
         
            -
            <p>< 
     | 
| 
      
 357 
     | 
    
         
            +
            <code class="docutils literal"><span class="pre">TokenBigramSplitXXX</span></code> type tokenizers such as
         
     | 
| 
      
 358 
     | 
    
         
            +
            <a class="reference internal" href="#token-bigram-split-symbol-alpha"><span>TokenBigramSplitSymbolAlpha</span></a>.</p>
         
     | 
| 
      
 359 
     | 
    
         
            +
            <p>Let's confirm <code class="docutils literal"><span class="pre">TokenBigram</span></code> behavior by example.</p>
         
     | 
| 
      
 360 
     | 
    
         
            +
            <p><code class="docutils literal"><span class="pre">TokenBigram</span></code> uses one or more white-spaces as token delimiter for
         
     | 
| 
       339 
361 
     | 
    
         
             
            ASCII characters:</p>
         
     | 
| 
       340 
362 
     | 
    
         
             
            <p>Execution example:</p>
         
     | 
| 
       341 
363 
     | 
    
         
             
            <div class="highlight-none"><div class="highlight"><pre>tokenize TokenBigram "Hello World" NormalizerAuto
         
     | 
| 
         @@ -348,23 +370,25 @@ ASCII characters:</p> 
     | 
|
| 
       348 
370 
     | 
    
         
             
            #   [
         
     | 
| 
       349 
371 
     | 
    
         
             
            #     {
         
     | 
| 
       350 
372 
     | 
    
         
             
            #       "position": 0,
         
     | 
| 
      
 373 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       351 
374 
     | 
    
         
             
            #       "value": "hello"
         
     | 
| 
       352 
375 
     | 
    
         
             
            #     },
         
     | 
| 
       353 
376 
     | 
    
         
             
            #     {
         
     | 
| 
       354 
377 
     | 
    
         
             
            #       "position": 1,
         
     | 
| 
      
 378 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       355 
379 
     | 
    
         
             
            #       "value": "world"
         
     | 
| 
       356 
380 
     | 
    
         
             
            #     }
         
     | 
| 
       357 
381 
     | 
    
         
             
            #   ]
         
     | 
| 
       358 
382 
     | 
    
         
             
            # ]
         
     | 
| 
       359 
383 
     | 
    
         
             
            </pre></div>
         
     | 
| 
       360 
384 
     | 
    
         
             
            </div>
         
     | 
| 
       361 
     | 
    
         
            -
            <p>< 
     | 
| 
      
 385 
     | 
    
         
            +
            <p><code class="docutils literal"><span class="pre">TokenBigram</span></code> uses character type change as token delimiter for
         
     | 
| 
       362 
386 
     | 
    
         
             
            ASCII characters. Character type is one of them:</p>
         
     | 
| 
       363 
387 
     | 
    
         
             
            <blockquote>
         
     | 
| 
       364 
388 
     | 
    
         
             
            <div><ul class="simple">
         
     | 
| 
       365 
389 
     | 
    
         
             
            <li>Alphabet</li>
         
     | 
| 
       366 
390 
     | 
    
         
             
            <li>Digit</li>
         
     | 
| 
       367 
     | 
    
         
            -
            <li>Symbol (such as < 
     | 
| 
      
 391 
     | 
    
         
            +
            <li>Symbol (such as <code class="docutils literal"><span class="pre">(</span></code>, <code class="docutils literal"><span class="pre">)</span></code> and <code class="docutils literal"><span class="pre">!</span></code>)</li>
         
     | 
| 
       368 
392 
     | 
    
         
             
            <li>Hiragana</li>
         
     | 
| 
       369 
393 
     | 
    
         
             
            <li>Katakana</li>
         
     | 
| 
       370 
394 
     | 
    
         
             
            <li>Kanji</li>
         
     | 
| 
         @@ -374,8 +398,8 @@ ASCII characters. Character type is one of them:</p> 
     | 
|
| 
       374 
398 
     | 
    
         
             
            <p>The following example shows two token delimiters:</p>
         
     | 
| 
       375 
399 
     | 
    
         
             
            <blockquote>
         
     | 
| 
       376 
400 
     | 
    
         
             
            <div><ul class="simple">
         
     | 
| 
       377 
     | 
    
         
            -
            <li>at between < 
     | 
| 
       378 
     | 
    
         
            -
            <li>at between < 
     | 
| 
      
 401 
     | 
    
         
            +
            <li>at between <code class="docutils literal"><span class="pre">100</span></code> (digits) and <code class="docutils literal"><span class="pre">cents</span></code> (alphabets)</li>
         
     | 
| 
      
 402 
     | 
    
         
            +
            <li>at between <code class="docutils literal"><span class="pre">cents</span></code> (alphabets) and <code class="docutils literal"><span class="pre">!!!</span></code> (symbols)</li>
         
     | 
| 
       379 
403 
     | 
    
         
             
            </ul>
         
     | 
| 
       380 
404 
     | 
    
         
             
            </div></blockquote>
         
     | 
| 
       381 
405 
     | 
    
         
             
            <p>Execution example:</p>
         
     | 
| 
         @@ -389,21 +413,24 @@ ASCII characters. Character type is one of them:</p> 
     | 
|
| 
       389 
413 
     | 
    
         
             
            #   [
         
     | 
| 
       390 
414 
     | 
    
         
             
            #     {
         
     | 
| 
       391 
415 
     | 
    
         
             
            #       "position": 0,
         
     | 
| 
      
 416 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       392 
417 
     | 
    
         
             
            #       "value": "100"
         
     | 
| 
       393 
418 
     | 
    
         
             
            #     },
         
     | 
| 
       394 
419 
     | 
    
         
             
            #     {
         
     | 
| 
       395 
420 
     | 
    
         
             
            #       "position": 1,
         
     | 
| 
      
 421 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       396 
422 
     | 
    
         
             
            #       "value": "cents"
         
     | 
| 
       397 
423 
     | 
    
         
             
            #     },
         
     | 
| 
       398 
424 
     | 
    
         
             
            #     {
         
     | 
| 
       399 
425 
     | 
    
         
             
            #       "position": 2,
         
     | 
| 
      
 426 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       400 
427 
     | 
    
         
             
            #       "value": "!!!"
         
     | 
| 
       401 
428 
     | 
    
         
             
            #     }
         
     | 
| 
       402 
429 
     | 
    
         
             
            #   ]
         
     | 
| 
       403 
430 
     | 
    
         
             
            # ]
         
     | 
| 
       404 
431 
     | 
    
         
             
            </pre></div>
         
     | 
| 
       405 
432 
     | 
    
         
             
            </div>
         
     | 
| 
       406 
     | 
    
         
            -
            <p>Here is an example that < 
     | 
| 
      
 433 
     | 
    
         
            +
            <p>Here is an example that <code class="docutils literal"><span class="pre">TokenBigram</span></code> uses bigram tokenize method
         
     | 
| 
       407 
434 
     | 
    
         
             
            for non-ASCII characters.</p>
         
     | 
| 
       408 
435 
     | 
    
         
             
            <p>Execution example:</p>
         
     | 
| 
       409 
436 
     | 
    
         
             
            <div class="highlight-none"><div class="highlight"><pre>tokenize TokenBigram "日本語の勉強" NormalizerAuto
         
     | 
| 
         @@ -416,26 +443,32 @@ for non-ASCII characters.</p> 
     | 
|
| 
       416 
443 
     | 
    
         
             
            #   [
         
     | 
| 
       417 
444 
     | 
    
         
             
            #     {
         
     | 
| 
       418 
445 
     | 
    
         
             
            #       "position": 0,
         
     | 
| 
      
 446 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       419 
447 
     | 
    
         
             
            #       "value": "日本"
         
     | 
| 
       420 
448 
     | 
    
         
             
            #     },
         
     | 
| 
       421 
449 
     | 
    
         
             
            #     {
         
     | 
| 
       422 
450 
     | 
    
         
             
            #       "position": 1,
         
     | 
| 
      
 451 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       423 
452 
     | 
    
         
             
            #       "value": "本語"
         
     | 
| 
       424 
453 
     | 
    
         
             
            #     },
         
     | 
| 
       425 
454 
     | 
    
         
             
            #     {
         
     | 
| 
       426 
455 
     | 
    
         
             
            #       "position": 2,
         
     | 
| 
      
 456 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       427 
457 
     | 
    
         
             
            #       "value": "語の"
         
     | 
| 
       428 
458 
     | 
    
         
             
            #     },
         
     | 
| 
       429 
459 
     | 
    
         
             
            #     {
         
     | 
| 
       430 
460 
     | 
    
         
             
            #       "position": 3,
         
     | 
| 
      
 461 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       431 
462 
     | 
    
         
             
            #       "value": "の勉"
         
     | 
| 
       432 
463 
     | 
    
         
             
            #     },
         
     | 
| 
       433 
464 
     | 
    
         
             
            #     {
         
     | 
| 
       434 
465 
     | 
    
         
             
            #       "position": 4,
         
     | 
| 
      
 466 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       435 
467 
     | 
    
         
             
            #       "value": "勉強"
         
     | 
| 
       436 
468 
     | 
    
         
             
            #     },
         
     | 
| 
       437 
469 
     | 
    
         
             
            #     {
         
     | 
| 
       438 
470 
     | 
    
         
             
            #       "position": 5,
         
     | 
| 
      
 471 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       439 
472 
     | 
    
         
             
            #       "value": "強"
         
     | 
| 
       440 
473 
     | 
    
         
             
            #     }
         
     | 
| 
       441 
474 
     | 
    
         
             
            #   ]
         
     | 
| 
         @@ -444,9 +477,9 @@ for non-ASCII characters.</p> 
     | 
|
| 
       444 
477 
     | 
    
         
             
            </div>
         
     | 
| 
       445 
478 
     | 
    
         
             
            </div>
         
     | 
| 
       446 
479 
     | 
    
         
             
            <div class="section" id="tokenbigramsplitsymbol">
         
     | 
| 
       447 
     | 
    
         
            -
            <span id="token-bigram-split-symbol"></span><h3>7.8.3.2. < 
     | 
| 
       448 
     | 
    
         
            -
            <p>< 
     | 
| 
       449 
     | 
    
         
            -
            difference between them is symbol handling. < 
     | 
| 
      
 480 
     | 
    
         
            +
            <span id="token-bigram-split-symbol"></span><h3>7.8.3.2. <code class="docutils literal"><span class="pre">TokenBigramSplitSymbol</span></code><a class="headerlink" href="#tokenbigramsplitsymbol" title="Permalink to this headline">¶</a></h3>
         
     | 
| 
      
 481 
     | 
    
         
            +
            <p><code class="docutils literal"><span class="pre">TokenBigramSplitSymbol</span></code> is similar to <a class="reference internal" href="#token-bigram"><span>TokenBigram</span></a>. The
         
     | 
| 
      
 482 
     | 
    
         
            +
            difference between them is symbol handling. <code class="docutils literal"><span class="pre">TokenBigramSplitSymbol</span></code>
         
     | 
| 
       450 
483 
     | 
    
         
             
            tokenizes symbols by bigram tokenize method:</p>
         
     | 
| 
       451 
484 
     | 
    
         
             
            <p>Execution example:</p>
         
     | 
| 
       452 
485 
     | 
    
         
             
            <div class="highlight-none"><div class="highlight"><pre>tokenize TokenBigramSplitSymbol "100cents!!!" NormalizerAuto
         
     | 
| 
         @@ -459,22 +492,27 @@ tokenizes symbols by bigram tokenize method:</p> 
     | 
|
| 
       459 
492 
     | 
    
         
             
            #   [
         
     | 
| 
       460 
493 
     | 
    
         
             
            #     {
         
     | 
| 
       461 
494 
     | 
    
         
             
            #       "position": 0,
         
     | 
| 
      
 495 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       462 
496 
     | 
    
         
             
            #       "value": "100"
         
     | 
| 
       463 
497 
     | 
    
         
             
            #     },
         
     | 
| 
       464 
498 
     | 
    
         
             
            #     {
         
     | 
| 
       465 
499 
     | 
    
         
             
            #       "position": 1,
         
     | 
| 
      
 500 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       466 
501 
     | 
    
         
             
            #       "value": "cents"
         
     | 
| 
       467 
502 
     | 
    
         
             
            #     },
         
     | 
| 
       468 
503 
     | 
    
         
             
            #     {
         
     | 
| 
       469 
504 
     | 
    
         
             
            #       "position": 2,
         
     | 
| 
      
 505 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       470 
506 
     | 
    
         
             
            #       "value": "!!"
         
     | 
| 
       471 
507 
     | 
    
         
             
            #     },
         
     | 
| 
       472 
508 
     | 
    
         
             
            #     {
         
     | 
| 
       473 
509 
     | 
    
         
             
            #       "position": 3,
         
     | 
| 
      
 510 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       474 
511 
     | 
    
         
             
            #       "value": "!!"
         
     | 
| 
       475 
512 
     | 
    
         
             
            #     },
         
     | 
| 
       476 
513 
     | 
    
         
             
            #     {
         
     | 
| 
       477 
514 
     | 
    
         
             
            #       "position": 4,
         
     | 
| 
      
 515 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       478 
516 
     | 
    
         
             
            #       "value": "!"
         
     | 
| 
       479 
517 
     | 
    
         
             
            #     }
         
     | 
| 
       480 
518 
     | 
    
         
             
            #   ]
         
     | 
| 
         @@ -483,10 +521,10 @@ tokenizes symbols by bigram tokenize method:</p> 
     | 
|
| 
       483 
521 
     | 
    
         
             
            </div>
         
     | 
| 
       484 
522 
     | 
    
         
             
            </div>
         
     | 
| 
       485 
523 
     | 
    
         
             
            <div class="section" id="tokenbigramsplitsymbolalpha">
         
     | 
| 
       486 
     | 
    
         
            -
            <span id="token-bigram-split-symbol-alpha"></span><h3>7.8.3.3. < 
     | 
| 
       487 
     | 
    
         
            -
            <p>< 
     | 
| 
      
 524 
     | 
    
         
            +
            <span id="token-bigram-split-symbol-alpha"></span><h3>7.8.3.3. <code class="docutils literal"><span class="pre">TokenBigramSplitSymbolAlpha</span></code><a class="headerlink" href="#tokenbigramsplitsymbolalpha" title="Permalink to this headline">¶</a></h3>
         
     | 
| 
      
 525 
     | 
    
         
            +
            <p><code class="docutils literal"><span class="pre">TokenBigramSplitSymbolAlpha</span></code> is similar to <a class="reference internal" href="#token-bigram"><span>TokenBigram</span></a>. The
         
     | 
| 
       488 
526 
     | 
    
         
             
            difference between them is symbol and alphabet
         
     | 
| 
       489 
     | 
    
         
            -
            handling. < 
     | 
| 
      
 527 
     | 
    
         
            +
            handling. <code class="docutils literal"><span class="pre">TokenBigramSplitSymbolAlpha</span></code> tokenizes symbols and
         
     | 
| 
       490 
528 
     | 
    
         
             
            alphabets by bigram tokenize method:</p>
         
     | 
| 
       491 
529 
     | 
    
         
             
            <p>Execution example:</p>
         
     | 
| 
       492 
530 
     | 
    
         
             
            <div class="highlight-none"><div class="highlight"><pre>tokenize TokenBigramSplitSymbolAlpha "100cents!!!" NormalizerAuto
         
     | 
| 
         @@ -499,38 +537,47 @@ alphabets by bigram tokenize method:</p> 
     | 
|
| 
       499 
537 
     | 
    
         
             
            #   [
         
     | 
| 
       500 
538 
     | 
    
         
             
            #     {
         
     | 
| 
       501 
539 
     | 
    
         
             
            #       "position": 0,
         
     | 
| 
      
 540 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       502 
541 
     | 
    
         
             
            #       "value": "100"
         
     | 
| 
       503 
542 
     | 
    
         
             
            #     },
         
     | 
| 
       504 
543 
     | 
    
         
             
            #     {
         
     | 
| 
       505 
544 
     | 
    
         
             
            #       "position": 1,
         
     | 
| 
      
 545 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       506 
546 
     | 
    
         
             
            #       "value": "ce"
         
     | 
| 
       507 
547 
     | 
    
         
             
            #     },
         
     | 
| 
       508 
548 
     | 
    
         
             
            #     {
         
     | 
| 
       509 
549 
     | 
    
         
             
            #       "position": 2,
         
     | 
| 
      
 550 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       510 
551 
     | 
    
         
             
            #       "value": "en"
         
     | 
| 
       511 
552 
     | 
    
         
             
            #     },
         
     | 
| 
       512 
553 
     | 
    
         
             
            #     {
         
     | 
| 
       513 
554 
     | 
    
         
             
            #       "position": 3,
         
     | 
| 
      
 555 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       514 
556 
     | 
    
         
             
            #       "value": "nt"
         
     | 
| 
       515 
557 
     | 
    
         
             
            #     },
         
     | 
| 
       516 
558 
     | 
    
         
             
            #     {
         
     | 
| 
       517 
559 
     | 
    
         
             
            #       "position": 4,
         
     | 
| 
      
 560 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       518 
561 
     | 
    
         
             
            #       "value": "ts"
         
     | 
| 
       519 
562 
     | 
    
         
             
            #     },
         
     | 
| 
       520 
563 
     | 
    
         
             
            #     {
         
     | 
| 
       521 
564 
     | 
    
         
             
            #       "position": 5,
         
     | 
| 
      
 565 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       522 
566 
     | 
    
         
             
            #       "value": "s!"
         
     | 
| 
       523 
567 
     | 
    
         
             
            #     },
         
     | 
| 
       524 
568 
     | 
    
         
             
            #     {
         
     | 
| 
       525 
569 
     | 
    
         
             
            #       "position": 6,
         
     | 
| 
      
 570 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       526 
571 
     | 
    
         
             
            #       "value": "!!"
         
     | 
| 
       527 
572 
     | 
    
         
             
            #     },
         
     | 
| 
       528 
573 
     | 
    
         
             
            #     {
         
     | 
| 
       529 
574 
     | 
    
         
             
            #       "position": 7,
         
     | 
| 
      
 575 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       530 
576 
     | 
    
         
             
            #       "value": "!!"
         
     | 
| 
       531 
577 
     | 
    
         
             
            #     },
         
     | 
| 
       532 
578 
     | 
    
         
             
            #     {
         
     | 
| 
       533 
579 
     | 
    
         
             
            #       "position": 8,
         
     | 
| 
      
 580 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       534 
581 
     | 
    
         
             
            #       "value": "!"
         
     | 
| 
       535 
582 
     | 
    
         
             
            #     }
         
     | 
| 
       536 
583 
     | 
    
         
             
            #   ]
         
     | 
| 
         @@ -539,10 +586,10 @@ alphabets by bigram tokenize method:</p> 
     | 
|
| 
       539 
586 
     | 
    
         
             
            </div>
         
     | 
| 
       540 
587 
     | 
    
         
             
            </div>
         
     | 
| 
       541 
588 
     | 
    
         
             
            <div class="section" id="tokenbigramsplitsymbolalphadigit">
         
     | 
| 
       542 
     | 
    
         
            -
            <span id="token-bigram-split-symbol-alpha-digit"></span><h3>7.8.3.4. < 
     | 
| 
       543 
     | 
    
         
            -
            <p>< 
     | 
| 
       544 
     | 
    
         
            -
            <a class="reference internal" href="#token-bigram">< 
     | 
| 
       545 
     | 
    
         
            -
            and digit handling. < 
     | 
| 
      
 589 
     | 
    
         
            +
            <span id="token-bigram-split-symbol-alpha-digit"></span><h3>7.8.3.4. <code class="docutils literal"><span class="pre">TokenBigramSplitSymbolAlphaDigit</span></code><a class="headerlink" href="#tokenbigramsplitsymbolalphadigit" title="Permalink to this headline">¶</a></h3>
         
     | 
| 
      
 590 
     | 
    
         
            +
            <p><code class="docutils literal"><span class="pre">TokenBigramSplitSymbolAlphaDigit</span></code> is similar to
         
     | 
| 
      
 591 
     | 
    
         
            +
            <a class="reference internal" href="#token-bigram"><span>TokenBigram</span></a>. The difference between them is symbol, alphabet
         
     | 
| 
      
 592 
     | 
    
         
            +
            and digit handling. <code class="docutils literal"><span class="pre">TokenBigramSplitSymbolAlphaDigit</span></code> tokenizes
         
     | 
| 
       546 
593 
     | 
    
         
             
            symbols, alphabets and digits by bigram tokenize method. It means that
         
     | 
| 
       547 
594 
     | 
    
         
             
            all characters are tokenized by bigram tokenize method:</p>
         
     | 
| 
       548 
595 
     | 
    
         
             
            <p>Execution example:</p>
         
     | 
| 
         @@ -556,46 +603,57 @@ all characters are tokenized by bigram tokenize method:</p> 
     | 
|
| 
       556 
603 
     | 
    
         
             
            #   [
         
     | 
| 
       557 
604 
     | 
    
         
             
            #     {
         
     | 
| 
       558 
605 
     | 
    
         
             
            #       "position": 0,
         
     | 
| 
      
 606 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       559 
607 
     | 
    
         
             
            #       "value": "10"
         
     | 
| 
       560 
608 
     | 
    
         
             
            #     },
         
     | 
| 
       561 
609 
     | 
    
         
             
            #     {
         
     | 
| 
       562 
610 
     | 
    
         
             
            #       "position": 1,
         
     | 
| 
      
 611 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       563 
612 
     | 
    
         
             
            #       "value": "00"
         
     | 
| 
       564 
613 
     | 
    
         
             
            #     },
         
     | 
| 
       565 
614 
     | 
    
         
             
            #     {
         
     | 
| 
       566 
615 
     | 
    
         
             
            #       "position": 2,
         
     | 
| 
      
 616 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       567 
617 
     | 
    
         
             
            #       "value": "0c"
         
     | 
| 
       568 
618 
     | 
    
         
             
            #     },
         
     | 
| 
       569 
619 
     | 
    
         
             
            #     {
         
     | 
| 
       570 
620 
     | 
    
         
             
            #       "position": 3,
         
     | 
| 
      
 621 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       571 
622 
     | 
    
         
             
            #       "value": "ce"
         
     | 
| 
       572 
623 
     | 
    
         
             
            #     },
         
     | 
| 
       573 
624 
     | 
    
         
             
            #     {
         
     | 
| 
       574 
625 
     | 
    
         
             
            #       "position": 4,
         
     | 
| 
      
 626 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       575 
627 
     | 
    
         
             
            #       "value": "en"
         
     | 
| 
       576 
628 
     | 
    
         
             
            #     },
         
     | 
| 
       577 
629 
     | 
    
         
             
            #     {
         
     | 
| 
       578 
630 
     | 
    
         
             
            #       "position": 5,
         
     | 
| 
      
 631 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       579 
632 
     | 
    
         
             
            #       "value": "nt"
         
     | 
| 
       580 
633 
     | 
    
         
             
            #     },
         
     | 
| 
       581 
634 
     | 
    
         
             
            #     {
         
     | 
| 
       582 
635 
     | 
    
         
             
            #       "position": 6,
         
     | 
| 
      
 636 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       583 
637 
     | 
    
         
             
            #       "value": "ts"
         
     | 
| 
       584 
638 
     | 
    
         
             
            #     },
         
     | 
| 
       585 
639 
     | 
    
         
             
            #     {
         
     | 
| 
       586 
640 
     | 
    
         
             
            #       "position": 7,
         
     | 
| 
      
 641 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       587 
642 
     | 
    
         
             
            #       "value": "s!"
         
     | 
| 
       588 
643 
     | 
    
         
             
            #     },
         
     | 
| 
       589 
644 
     | 
    
         
             
            #     {
         
     | 
| 
       590 
645 
     | 
    
         
             
            #       "position": 8,
         
     | 
| 
      
 646 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       591 
647 
     | 
    
         
             
            #       "value": "!!"
         
     | 
| 
       592 
648 
     | 
    
         
             
            #     },
         
     | 
| 
       593 
649 
     | 
    
         
             
            #     {
         
     | 
| 
       594 
650 
     | 
    
         
             
            #       "position": 9,
         
     | 
| 
      
 651 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       595 
652 
     | 
    
         
             
            #       "value": "!!"
         
     | 
| 
       596 
653 
     | 
    
         
             
            #     },
         
     | 
| 
       597 
654 
     | 
    
         
             
            #     {
         
     | 
| 
       598 
655 
     | 
    
         
             
            #       "position": 10,
         
     | 
| 
      
 656 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       599 
657 
     | 
    
         
             
            #       "value": "!"
         
     | 
| 
       600 
658 
     | 
    
         
             
            #     }
         
     | 
| 
       601 
659 
     | 
    
         
             
            #   ]
         
     | 
| 
         @@ -604,13 +662,13 @@ all characters are tokenized by bigram tokenize method:</p> 
     | 
|
| 
       604 
662 
     | 
    
         
             
            </div>
         
     | 
| 
       605 
663 
     | 
    
         
             
            </div>
         
     | 
| 
       606 
664 
     | 
    
         
             
            <div class="section" id="tokenbigramignoreblank">
         
     | 
| 
       607 
     | 
    
         
            -
            <span id="token-bigram-ignore-blank"></span><h3>7.8.3.5. < 
     | 
| 
       608 
     | 
    
         
            -
            <p>< 
     | 
| 
       609 
     | 
    
         
            -
            difference between them is blank handling. < 
     | 
| 
      
 665 
     | 
    
         
            +
            <span id="token-bigram-ignore-blank"></span><h3>7.8.3.5. <code class="docutils literal"><span class="pre">TokenBigramIgnoreBlank</span></code><a class="headerlink" href="#tokenbigramignoreblank" title="Permalink to this headline">¶</a></h3>
         
     | 
| 
      
 666 
     | 
    
         
            +
            <p><code class="docutils literal"><span class="pre">TokenBigramIgnoreBlank</span></code> is similar to <a class="reference internal" href="#token-bigram"><span>TokenBigram</span></a>. The
         
     | 
| 
      
 667 
     | 
    
         
            +
            difference between them is blank handling. <code class="docutils literal"><span class="pre">TokenBigramIgnoreBlank</span></code>
         
     | 
| 
       610 
668 
     | 
    
         
             
            ignores white-spaces in continuous symbols and non-ASCII characters.</p>
         
     | 
| 
       611 
     | 
    
         
            -
            <p>You can find difference of them by < 
     | 
| 
      
 669 
     | 
    
         
            +
            <p>You can see the difference between them with the text <code class="docutils literal"><span class="pre">日</span> <span class="pre">本</span> <span class="pre">語</span> <span class="pre">!</span> <span class="pre">!</span> <span class="pre">!</span></code> because it
         
     | 
| 
       612 
670 
     | 
    
         
             
            has symbols and non-ASCII characters.</p>
         
     | 
| 
       613 
     | 
    
         
            -
            <p>Here is a result by <a class="reference internal" href="#token-bigram">< 
     | 
| 
      
 671 
     | 
    
         
            +
            <p>Here is a result by <a class="reference internal" href="#token-bigram"><span>TokenBigram</span></a> :</p>
         
     | 
| 
       614 
672 
     | 
    
         
             
            <p>Execution example:</p>
         
     | 
| 
       615 
673 
     | 
    
         
             
            <div class="highlight-none"><div class="highlight"><pre>tokenize TokenBigram "日 本 語 ! ! !" NormalizerAuto
         
     | 
| 
       616 
674 
     | 
    
         
             
            # [
         
     | 
| 
         @@ -622,33 +680,39 @@ has symbols and non-ASCII characters.</p> 
     | 
|
| 
       622 
680 
     | 
    
         
             
            #   [
         
     | 
| 
       623 
681 
     | 
    
         
             
            #     {
         
     | 
| 
       624 
682 
     | 
    
         
             
            #       "position": 0,
         
     | 
| 
      
 683 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       625 
684 
     | 
    
         
             
            #       "value": "日"
         
     | 
| 
       626 
685 
     | 
    
         
             
            #     },
         
     | 
| 
       627 
686 
     | 
    
         
             
            #     {
         
     | 
| 
       628 
687 
     | 
    
         
             
            #       "position": 1,
         
     | 
| 
      
 688 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       629 
689 
     | 
    
         
             
            #       "value": "本"
         
     | 
| 
       630 
690 
     | 
    
         
             
            #     },
         
     | 
| 
       631 
691 
     | 
    
         
             
            #     {
         
     | 
| 
       632 
692 
     | 
    
         
             
            #       "position": 2,
         
     | 
| 
      
 693 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       633 
694 
     | 
    
         
             
            #       "value": "語"
         
     | 
| 
       634 
695 
     | 
    
         
             
            #     },
         
     | 
| 
       635 
696 
     | 
    
         
             
            #     {
         
     | 
| 
       636 
697 
     | 
    
         
             
            #       "position": 3,
         
     | 
| 
      
 698 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       637 
699 
     | 
    
         
             
            #       "value": "!"
         
     | 
| 
       638 
700 
     | 
    
         
             
            #     },
         
     | 
| 
       639 
701 
     | 
    
         
             
            #     {
         
     | 
| 
       640 
702 
     | 
    
         
             
            #       "position": 4,
         
     | 
| 
      
 703 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       641 
704 
     | 
    
         
             
            #       "value": "!"
         
     | 
| 
       642 
705 
     | 
    
         
             
            #     },
         
     | 
| 
       643 
706 
     | 
    
         
             
            #     {
         
     | 
| 
       644 
707 
     | 
    
         
             
            #       "position": 5,
         
     | 
| 
      
 708 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       645 
709 
     | 
    
         
             
            #       "value": "!"
         
     | 
| 
       646 
710 
     | 
    
         
             
            #     }
         
     | 
| 
       647 
711 
     | 
    
         
             
            #   ]
         
     | 
| 
       648 
712 
     | 
    
         
             
            # ]
         
     | 
| 
       649 
713 
     | 
    
         
             
            </pre></div>
         
     | 
| 
       650 
714 
     | 
    
         
             
            </div>
         
     | 
| 
       651 
     | 
    
         
            -
            <p>Here is a result by < 
     | 
| 
      
 715 
     | 
    
         
            +
            <p>Here is a result by <code class="docutils literal"><span class="pre">TokenBigramIgnoreBlank</span></code>:</p>
         
     | 
| 
       652 
716 
     | 
    
         
             
            <p>Execution example:</p>
         
     | 
| 
       653 
717 
     | 
    
         
             
            <div class="highlight-none"><div class="highlight"><pre>tokenize TokenBigramIgnoreBlank "日 本 語 ! ! !" NormalizerAuto
         
     | 
| 
       654 
718 
     | 
    
         
             
            # [
         
     | 
| 
         @@ -660,18 +724,22 @@ has symbols and non-ASCII characters.</p> 
     | 
|
| 
       660 
724 
     | 
    
         
             
            #   [
         
     | 
| 
       661 
725 
     | 
    
         
             
            #     {
         
     | 
| 
       662 
726 
     | 
    
         
             
            #       "position": 0,
         
     | 
| 
      
 727 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       663 
728 
     | 
    
         
             
            #       "value": "日本"
         
     | 
| 
       664 
729 
     | 
    
         
             
            #     },
         
     | 
| 
       665 
730 
     | 
    
         
             
            #     {
         
     | 
| 
       666 
731 
     | 
    
         
             
            #       "position": 1,
         
     | 
| 
      
 732 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       667 
733 
     | 
    
         
             
            #       "value": "本語"
         
     | 
| 
       668 
734 
     | 
    
         
             
            #     },
         
     | 
| 
       669 
735 
     | 
    
         
             
            #     {
         
     | 
| 
       670 
736 
     | 
    
         
             
            #       "position": 2,
         
     | 
| 
      
 737 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       671 
738 
     | 
    
         
             
            #       "value": "語"
         
     | 
| 
       672 
739 
     | 
    
         
             
            #     },
         
     | 
| 
       673 
740 
     | 
    
         
             
            #     {
         
     | 
| 
       674 
741 
     | 
    
         
             
            #       "position": 3,
         
     | 
| 
      
 742 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       675 
743 
     | 
    
         
             
            #       "value": "!!!"
         
     | 
| 
       676 
744 
     | 
    
         
             
            #     }
         
     | 
| 
       677 
745 
     | 
    
         
             
            #   ]
         
     | 
| 
         @@ -680,22 +748,22 @@ has symbols and non-ASCII characters.</p> 
     | 
|
| 
       680 
748 
     | 
    
         
             
            </div>
         
     | 
| 
       681 
749 
     | 
    
         
             
            </div>
         
     | 
| 
       682 
750 
     | 
    
         
             
            <div class="section" id="tokenbigramignoreblanksplitsymbol">
         
     | 
| 
       683 
     | 
    
         
            -
            <span id="token-bigram-ignore-blank-split-symbol"></span><h3>7.8.3.6. < 
     | 
| 
       684 
     | 
    
         
            -
            <p>< 
     | 
| 
       685 
     | 
    
         
            -
            <a class="reference internal" href="#token-bigram">< 
     | 
| 
      
 751 
     | 
    
         
            +
            <span id="token-bigram-ignore-blank-split-symbol"></span><h3>7.8.3.6. <code class="docutils literal"><span class="pre">TokenBigramIgnoreBlankSplitSymbol</span></code><a class="headerlink" href="#tokenbigramignoreblanksplitsymbol" title="Permalink to this headline">¶</a></h3>
         
     | 
| 
      
 752 
     | 
    
         
            +
            <p><code class="docutils literal"><span class="pre">TokenBigramIgnoreBlankSplitSymbol</span></code> is similar to
         
     | 
| 
      
 753 
     | 
    
         
            +
            <a class="reference internal" href="#token-bigram"><span>TokenBigram</span></a>. The differences between them are as follows:</p>
         
     | 
| 
       686 
754 
     | 
    
         
             
            <blockquote>
         
     | 
| 
       687 
755 
     | 
    
         
             
            <div><ul class="simple">
         
     | 
| 
       688 
756 
     | 
    
         
             
            <li>Blank handling</li>
         
     | 
| 
       689 
757 
     | 
    
         
             
            <li>Symbol handling</li>
         
     | 
| 
       690 
758 
     | 
    
         
             
            </ul>
         
     | 
| 
       691 
759 
     | 
    
         
             
            </div></blockquote>
         
     | 
| 
       692 
     | 
    
         
            -
            <p>< 
     | 
| 
      
 760 
     | 
    
         
            +
            <p><code class="docutils literal"><span class="pre">TokenBigramIgnoreBlankSplitSymbol</span></code> ignores white-spaces in
         
     | 
| 
       693 
761 
     | 
    
         
             
            continuous symbols and non-ASCII characters.</p>
         
     | 
| 
       694 
     | 
    
         
            -
            <p>< 
     | 
| 
      
 762 
     | 
    
         
            +
            <p><code class="docutils literal"><span class="pre">TokenBigramIgnoreBlankSplitSymbol</span></code> tokenizes symbols by bigram
         
     | 
| 
       695 
763 
     | 
    
         
             
            tokenize method.</p>
         
     | 
| 
       696 
     | 
    
         
            -
            <p>You can find difference of them by < 
     | 
| 
      
 764 
     | 
    
         
            +
            <p>You can see the difference between them with the text <code class="docutils literal"><span class="pre">日</span> <span class="pre">本</span> <span class="pre">語</span> <span class="pre">!</span> <span class="pre">!</span> <span class="pre">!</span></code> because it
         
     | 
| 
       697 
765 
     | 
    
         
             
            has symbols and non-ASCII characters.</p>
         
     | 
| 
       698 
     | 
    
         
            -
            <p>Here is a result by <a class="reference internal" href="#token-bigram">< 
     | 
| 
      
 766 
     | 
    
         
            +
            <p>Here is a result by <a class="reference internal" href="#token-bigram"><span>TokenBigram</span></a> :</p>
         
     | 
| 
       699 
767 
     | 
    
         
             
            <p>Execution example:</p>
         
     | 
| 
       700 
768 
     | 
    
         
             
            <div class="highlight-none"><div class="highlight"><pre>tokenize TokenBigram "日 本 語 ! ! !" NormalizerAuto
         
     | 
| 
       701 
769 
     | 
    
         
             
            # [
         
     | 
| 
         @@ -707,33 +775,39 @@ has symbols and non-ASCII characters.</p> 
     | 
|
| 
       707 
775 
     | 
    
         
             
            #   [
         
     | 
| 
       708 
776 
     | 
    
         
             
            #     {
         
     | 
| 
       709 
777 
     | 
    
         
             
            #       "position": 0,
         
     | 
| 
      
 778 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       710 
779 
     | 
    
         
             
            #       "value": "日"
         
     | 
| 
       711 
780 
     | 
    
         
             
            #     },
         
     | 
| 
       712 
781 
     | 
    
         
             
            #     {
         
     | 
| 
       713 
782 
     | 
    
         
             
            #       "position": 1,
         
     | 
| 
      
 783 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       714 
784 
     | 
    
         
             
            #       "value": "本"
         
     | 
| 
       715 
785 
     | 
    
         
             
            #     },
         
     | 
| 
       716 
786 
     | 
    
         
             
            #     {
         
     | 
| 
       717 
787 
     | 
    
         
             
            #       "position": 2,
         
     | 
| 
      
 788 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       718 
789 
     | 
    
         
             
            #       "value": "語"
         
     | 
| 
       719 
790 
     | 
    
         
             
            #     },
         
     | 
| 
       720 
791 
     | 
    
         
             
            #     {
         
     | 
| 
       721 
792 
     | 
    
         
             
            #       "position": 3,
         
     | 
| 
      
 793 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       722 
794 
     | 
    
         
             
            #       "value": "!"
         
     | 
| 
       723 
795 
     | 
    
         
             
            #     },
         
     | 
| 
       724 
796 
     | 
    
         
             
            #     {
         
     | 
| 
       725 
797 
     | 
    
         
             
            #       "position": 4,
         
     | 
| 
      
 798 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       726 
799 
     | 
    
         
             
            #       "value": "!"
         
     | 
| 
       727 
800 
     | 
    
         
             
            #     },
         
     | 
| 
       728 
801 
     | 
    
         
             
            #     {
         
     | 
| 
       729 
802 
     | 
    
         
             
            #       "position": 5,
         
     | 
| 
      
 803 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       730 
804 
     | 
    
         
             
            #       "value": "!"
         
     | 
| 
       731 
805 
     | 
    
         
             
            #     }
         
     | 
| 
       732 
806 
     | 
    
         
             
            #   ]
         
     | 
| 
       733 
807 
     | 
    
         
             
            # ]
         
     | 
| 
       734 
808 
     | 
    
         
             
            </pre></div>
         
     | 
| 
       735 
809 
     | 
    
         
             
            </div>
         
     | 
| 
       736 
     | 
    
         
            -
            <p>Here is a result by < 
     | 
| 
      
 810 
     | 
    
         
            +
            <p>Here is a result by <code class="docutils literal"><span class="pre">TokenBigramIgnoreBlankSplitSymbol</span></code>:</p>
         
     | 
| 
       737 
811 
     | 
    
         
             
            <p>Execution example:</p>
         
     | 
| 
       738 
812 
     | 
    
         
             
            <div class="highlight-none"><div class="highlight"><pre>tokenize TokenBigramIgnoreBlankSplitSymbol "日 本 語 ! ! !" NormalizerAuto
         
     | 
| 
       739 
813 
     | 
    
         
             
            # [
         
     | 
| 
         @@ -745,26 +819,32 @@ has symbols and non-ASCII characters.</p> 
     | 
|
| 
       745 
819 
     | 
    
         
             
            #   [
         
     | 
| 
       746 
820 
     | 
    
         
             
            #     {
         
     | 
| 
       747 
821 
     | 
    
         
             
            #       "position": 0,
         
     | 
| 
      
 822 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       748 
823 
     | 
    
         
             
            #       "value": "日本"
         
     | 
| 
       749 
824 
     | 
    
         
             
            #     },
         
     | 
| 
       750 
825 
     | 
    
         
             
            #     {
         
     | 
| 
       751 
826 
     | 
    
         
             
            #       "position": 1,
         
     | 
| 
      
 827 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       752 
828 
     | 
    
         
             
            #       "value": "本語"
         
     | 
| 
       753 
829 
     | 
    
         
             
            #     },
         
     | 
| 
       754 
830 
     | 
    
         
             
            #     {
         
     | 
| 
       755 
831 
     | 
    
         
             
            #       "position": 2,
         
     | 
| 
      
 832 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       756 
833 
     | 
    
         
             
            #       "value": "語!"
         
     | 
| 
       757 
834 
     | 
    
         
             
            #     },
         
     | 
| 
       758 
835 
     | 
    
         
             
            #     {
         
     | 
| 
       759 
836 
     | 
    
         
             
            #       "position": 3,
         
     | 
| 
      
 837 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       760 
838 
     | 
    
         
             
            #       "value": "!!"
         
     | 
| 
       761 
839 
     | 
    
         
             
            #     },
         
     | 
| 
       762 
840 
     | 
    
         
             
            #     {
         
     | 
| 
       763 
841 
     | 
    
         
             
            #       "position": 4,
         
     | 
| 
      
 842 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       764 
843 
     | 
    
         
             
            #       "value": "!!"
         
     | 
| 
       765 
844 
     | 
    
         
             
            #     },
         
     | 
| 
       766 
845 
     | 
    
         
             
            #     {
         
     | 
| 
       767 
846 
     | 
    
         
             
            #       "position": 5,
         
     | 
| 
      
 847 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       768 
848 
     | 
    
         
             
            #       "value": "!"
         
     | 
| 
       769 
849 
     | 
    
         
             
            #     }
         
     | 
| 
       770 
850 
     | 
    
         
             
            #   ]
         
     | 
| 
         @@ -773,22 +853,22 @@ has symbols and non-ASCII characters.</p> 
     | 
|
| 
       773 
853 
     | 
    
         
             
            </div>
         
     | 
| 
       774 
854 
     | 
    
         
             
            </div>
         
     | 
| 
       775 
855 
     | 
    
         
             
            <div class="section" id="tokenbigramignoreblanksplitsymbolalpha">
         
     | 
| 
       776 
     | 
    
         
            -
            <span id="token-bigram-ignore-blank-split-symbol-alpha"></span><h3>7.8.3.7. < 
     | 
| 
       777 
     | 
    
         
            -
            <p>< 
     | 
| 
       778 
     | 
    
         
            -
            <a class="reference internal" href="#token-bigram">< 
     | 
| 
      
 856 
     | 
    
         
            +
            <span id="token-bigram-ignore-blank-split-symbol-alpha"></span><h3>7.8.3.7. <code class="docutils literal"><span class="pre">TokenBigramIgnoreBlankSplitSymbolAlpha</span></code><a class="headerlink" href="#tokenbigramignoreblanksplitsymbolalpha" title="Permalink to this headline">¶</a></h3>
         
     | 
| 
      
 857 
     | 
    
         
            +
            <p><code class="docutils literal"><span class="pre">TokenBigramIgnoreBlankSplitSymbolAlpha</span></code> is similar to
         
     | 
| 
      
 858 
     | 
    
         
            +
            <a class="reference internal" href="#token-bigram"><span>TokenBigram</span></a>. The differences between them are as follows:</p>
         
     | 
| 
       779 
859 
     | 
    
         
             
            <blockquote>
         
     | 
| 
       780 
860 
     | 
    
         
             
            <div><ul class="simple">
         
     | 
| 
       781 
861 
     | 
    
         
             
            <li>Blank handling</li>
         
     | 
| 
       782 
862 
     | 
    
         
             
            <li>Symbol and alphabet handling</li>
         
     | 
| 
       783 
863 
     | 
    
         
             
            </ul>
         
     | 
| 
       784 
864 
     | 
    
         
             
            </div></blockquote>
         
     | 
| 
       785 
     | 
    
         
            -
            <p>< 
     | 
| 
      
 865 
     | 
    
         
            +
            <p><code class="docutils literal"><span class="pre">TokenBigramIgnoreBlankSplitSymbolAlpha</span></code> ignores white-spaces in
         
     | 
| 
       786 
866 
     | 
    
         
             
            continuous symbols and non-ASCII characters.</p>
         
     | 
| 
       787 
     | 
    
         
            -
            <p>< 
     | 
| 
      
 867 
     | 
    
         
            +
            <p><code class="docutils literal"><span class="pre">TokenBigramIgnoreBlankSplitSymbolAlpha</span></code> tokenizes symbols and
         
     | 
| 
       788 
868 
     | 
    
         
             
            alphabets by bigram tokenize method.</p>
         
     | 
| 
       789 
     | 
    
         
            -
            <p>You can find difference of them by < 
     | 
| 
      
 869 
     | 
    
         
            +
            <p>You can see the difference between them with the <code class="docutils literal"><span class="pre">Hello</span> <span class="pre">日</span> <span class="pre">本</span> <span class="pre">語</span> <span class="pre">!</span> <span class="pre">!</span> <span class="pre">!</span></code> text because it
         
     | 
| 
       790 
870 
     | 
    
         
             
            has symbols and non-ASCII characters with white spaces and alphabets.</p>
         
     | 
| 
       791 
     | 
    
         
            -
            <p>Here is a result by <a class="reference internal" href="#token-bigram">< 
     | 
| 
      
 871 
     | 
    
         
            +
            <p>Here is a result by <a class="reference internal" href="#token-bigram"><span>TokenBigram</span></a> :</p>
         
     | 
| 
       792 
872 
     | 
    
         
             
            <p>Execution example:</p>
         
     | 
| 
       793 
873 
     | 
    
         
             
            <div class="highlight-none"><div class="highlight"><pre>tokenize TokenBigram "Hello 日 本 語 ! ! !" NormalizerAuto
         
     | 
| 
       794 
874 
     | 
    
         
             
            # [
         
     | 
| 
         @@ -800,37 +880,44 @@ has symbols and non-ASCII characters with white spaces and alphabets.</p> 
     | 
|
| 
       800 
880 
     | 
    
         
             
            #   [
         
     | 
| 
       801 
881 
     | 
    
         
             
            #     {
         
     | 
| 
       802 
882 
     | 
    
         
             
            #       "position": 0,
         
     | 
| 
      
 883 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       803 
884 
     | 
    
         
             
            #       "value": "hello"
         
     | 
| 
       804 
885 
     | 
    
         
             
            #     },
         
     | 
| 
       805 
886 
     | 
    
         
             
            #     {
         
     | 
| 
       806 
887 
     | 
    
         
             
            #       "position": 1,
         
     | 
| 
      
 888 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       807 
889 
     | 
    
         
             
            #       "value": "日"
         
     | 
| 
       808 
890 
     | 
    
         
             
            #     },
         
     | 
| 
       809 
891 
     | 
    
         
             
            #     {
         
     | 
| 
       810 
892 
     | 
    
         
             
            #       "position": 2,
         
     | 
| 
      
 893 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       811 
894 
     | 
    
         
             
            #       "value": "本"
         
     | 
| 
       812 
895 
     | 
    
         
             
            #     },
         
     | 
| 
       813 
896 
     | 
    
         
             
            #     {
         
     | 
| 
       814 
897 
     | 
    
         
             
            #       "position": 3,
         
     | 
| 
      
 898 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       815 
899 
     | 
    
         
             
            #       "value": "語"
         
     | 
| 
       816 
900 
     | 
    
         
             
            #     },
         
     | 
| 
       817 
901 
     | 
    
         
             
            #     {
         
     | 
| 
       818 
902 
     | 
    
         
             
            #       "position": 4,
         
     | 
| 
      
 903 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       819 
904 
     | 
    
         
             
            #       "value": "!"
         
     | 
| 
       820 
905 
     | 
    
         
             
            #     },
         
     | 
| 
       821 
906 
     | 
    
         
             
            #     {
         
     | 
| 
       822 
907 
     | 
    
         
             
            #       "position": 5,
         
     | 
| 
      
 908 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       823 
909 
     | 
    
         
             
            #       "value": "!"
         
     | 
| 
       824 
910 
     | 
    
         
             
            #     },
         
     | 
| 
       825 
911 
     | 
    
         
             
            #     {
         
     | 
| 
       826 
912 
     | 
    
         
             
            #       "position": 6,
         
     | 
| 
      
 913 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       827 
914 
     | 
    
         
             
            #       "value": "!"
         
     | 
| 
       828 
915 
     | 
    
         
             
            #     }
         
     | 
| 
       829 
916 
     | 
    
         
             
            #   ]
         
     | 
| 
       830 
917 
     | 
    
         
             
            # ]
         
     | 
| 
       831 
918 
     | 
    
         
             
            </pre></div>
         
     | 
| 
       832 
919 
     | 
    
         
             
            </div>
         
     | 
| 
       833 
     | 
    
         
            -
            <p>Here is a result by < 
     | 
| 
      
 920 
     | 
    
         
            +
            <p>Here is a result by <code class="docutils literal"><span class="pre">TokenBigramIgnoreBlankSplitSymbolAlpha</span></code>:</p>
         
     | 
| 
       834 
921 
     | 
    
         
             
            <p>Execution example:</p>
         
     | 
| 
       835 
922 
     | 
    
         
             
            <div class="highlight-none"><div class="highlight"><pre>tokenize TokenBigramIgnoreBlankSplitSymbolAlpha "Hello 日 本 語 ! ! !" NormalizerAuto
         
     | 
| 
       836 
923 
     | 
    
         
             
            # [
         
     | 
| 
         @@ -842,46 +929,57 @@ has symbols and non-ASCII characters with white spaces and alphabets.</p> 
     | 
|
| 
       842 
929 
     | 
    
         
             
            #   [
         
     | 
| 
       843 
930 
     | 
    
         
             
            #     {
         
     | 
| 
       844 
931 
     | 
    
         
             
            #       "position": 0,
         
     | 
| 
      
 932 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       845 
933 
     | 
    
         
             
            #       "value": "he"
         
     | 
| 
       846 
934 
     | 
    
         
             
            #     },
         
     | 
| 
       847 
935 
     | 
    
         
             
            #     {
         
     | 
| 
       848 
936 
     | 
    
         
             
            #       "position": 1,
         
     | 
| 
      
 937 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       849 
938 
     | 
    
         
             
            #       "value": "el"
         
     | 
| 
       850 
939 
     | 
    
         
             
            #     },
         
     | 
| 
       851 
940 
     | 
    
         
             
            #     {
         
     | 
| 
       852 
941 
     | 
    
         
             
            #       "position": 2,
         
     | 
| 
      
 942 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       853 
943 
     | 
    
         
             
            #       "value": "ll"
         
     | 
| 
       854 
944 
     | 
    
         
             
            #     },
         
     | 
| 
       855 
945 
     | 
    
         
             
            #     {
         
     | 
| 
       856 
946 
     | 
    
         
             
            #       "position": 3,
         
     | 
| 
      
 947 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       857 
948 
     | 
    
         
             
            #       "value": "lo"
         
     | 
| 
       858 
949 
     | 
    
         
             
            #     },
         
     | 
| 
       859 
950 
     | 
    
         
             
            #     {
         
     | 
| 
       860 
951 
     | 
    
         
             
            #       "position": 4,
         
     | 
| 
      
 952 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       861 
953 
     | 
    
         
             
            #       "value": "o日"
         
     | 
| 
       862 
954 
     | 
    
         
             
            #     },
         
     | 
| 
       863 
955 
     | 
    
         
             
            #     {
         
     | 
| 
       864 
956 
     | 
    
         
             
            #       "position": 5,
         
     | 
| 
      
 957 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       865 
958 
     | 
    
         
             
            #       "value": "日本"
         
     | 
| 
       866 
959 
     | 
    
         
             
            #     },
         
     | 
| 
       867 
960 
     | 
    
         
             
            #     {
         
     | 
| 
       868 
961 
     | 
    
         
             
            #       "position": 6,
         
     | 
| 
      
 962 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       869 
963 
     | 
    
         
             
            #       "value": "本語"
         
     | 
| 
       870 
964 
     | 
    
         
             
            #     },
         
     | 
| 
       871 
965 
     | 
    
         
             
            #     {
         
     | 
| 
       872 
966 
     | 
    
         
             
            #       "position": 7,
         
     | 
| 
      
 967 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       873 
968 
     | 
    
         
             
            #       "value": "語!"
         
     | 
| 
       874 
969 
     | 
    
         
             
            #     },
         
     | 
| 
       875 
970 
     | 
    
         
             
            #     {
         
     | 
| 
       876 
971 
     | 
    
         
             
            #       "position": 8,
         
     | 
| 
      
 972 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       877 
973 
     | 
    
         
             
            #       "value": "!!"
         
     | 
| 
       878 
974 
     | 
    
         
             
            #     },
         
     | 
| 
       879 
975 
     | 
    
         
             
            #     {
         
     | 
| 
       880 
976 
     | 
    
         
             
            #       "position": 9,
         
     | 
| 
      
 977 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       881 
978 
     | 
    
         
             
            #       "value": "!!"
         
     | 
| 
       882 
979 
     | 
    
         
             
            #     },
         
     | 
| 
       883 
980 
     | 
    
         
             
            #     {
         
     | 
| 
       884 
981 
     | 
    
         
             
            #       "position": 10,
         
     | 
| 
      
 982 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       885 
983 
     | 
    
         
             
            #       "value": "!"
         
     | 
| 
       886 
984 
     | 
    
         
             
            #     }
         
     | 
| 
       887 
985 
     | 
    
         
             
            #   ]
         
     | 
| 
         @@ -890,24 +988,24 @@ has symbols and non-ASCII characters with white spaces and alphabets.</p> 
     | 
|
| 
       890 
988 
     | 
    
         
             
            </div>
         
     | 
| 
       891 
989 
     | 
    
         
             
            </div>
         
     | 
| 
       892 
990 
     | 
    
         
             
            <div class="section" id="tokenbigramignoreblanksplitsymbolalphadigit">
         
     | 
| 
       893 
     | 
    
         
            -
            <span id="token-bigram-ignore-blank-split-symbol-alpha-digit"></span><h3>7.8.3.8. < 
     | 
| 
       894 
     | 
    
         
            -
            <p>< 
     | 
| 
       895 
     | 
    
         
            -
            <a class="reference internal" href="#token-bigram">< 
     | 
| 
      
 991 
     | 
    
         
            +
            <span id="token-bigram-ignore-blank-split-symbol-alpha-digit"></span><h3>7.8.3.8. <code class="docutils literal"><span class="pre">TokenBigramIgnoreBlankSplitSymbolAlphaDigit</span></code><a class="headerlink" href="#tokenbigramignoreblanksplitsymbolalphadigit" title="Permalink to this headline">¶</a></h3>
         
     | 
| 
      
 992 
     | 
    
         
            +
            <p><code class="docutils literal"><span class="pre">TokenBigramIgnoreBlankSplitSymbolAlphaDigit</span></code> is similar to
         
     | 
| 
      
 993 
     | 
    
         
            +
            <a class="reference internal" href="#token-bigram"><span>TokenBigram</span></a>. The differences between them are as follows:</p>
         
     | 
| 
       896 
994 
     | 
    
         
             
            <blockquote>
         
     | 
| 
       897 
995 
     | 
    
         
             
            <div><ul class="simple">
         
     | 
| 
       898 
996 
     | 
    
         
             
            <li>Blank handling</li>
         
     | 
| 
       899 
997 
     | 
    
         
             
            <li>Symbol, alphabet and digit handling</li>
         
     | 
| 
       900 
998 
     | 
    
         
             
            </ul>
         
     | 
| 
       901 
999 
     | 
    
         
             
            </div></blockquote>
         
     | 
| 
       902 
     | 
    
         
            -
            <p>< 
     | 
| 
      
 1000 
     | 
    
         
            +
            <p><code class="docutils literal"><span class="pre">TokenBigramIgnoreBlankSplitSymbolAlphaDigit</span></code> ignores white-spaces
         
     | 
| 
       903 
1001 
     | 
    
         
             
            in continuous symbols and non-ASCII characters.</p>
         
     | 
| 
       904 
     | 
    
         
            -
            <p>< 
     | 
| 
      
 1002 
     | 
    
         
            +
            <p><code class="docutils literal"><span class="pre">TokenBigramIgnoreBlankSplitSymbolAlphaDigit</span></code> tokenizes symbols,
         
     | 
| 
       905 
1003 
     | 
    
         
             
            alphabets and digits by bigram tokenize method. It means that all
         
     | 
| 
       906 
1004 
     | 
    
         
             
            characters are tokenized by bigram tokenize method.</p>
         
     | 
| 
       907 
     | 
    
         
            -
            <p>You can find difference of them by < 
     | 
| 
      
 1005 
     | 
    
         
            +
            <p>You can see the difference between them with the <code class="docutils literal"><span class="pre">Hello</span> <span class="pre">日</span> <span class="pre">本</span> <span class="pre">語</span> <span class="pre">!</span> <span class="pre">!</span> <span class="pre">!</span> <span class="pre">777</span></code> text
         
     | 
| 
       908 
1006 
     | 
    
         
             
            because it has symbols and non-ASCII characters with white spaces,
         
     | 
| 
       909 
1007 
     | 
    
         
             
            alphabets and digits.</p>
         
     | 
| 
       910 
     | 
    
         
            -
            <p>Here is a result by <a class="reference internal" href="#token-bigram">< 
     | 
| 
      
 1008 
     | 
    
         
            +
            <p>Here is a result by <a class="reference internal" href="#token-bigram"><span>TokenBigram</span></a> :</p>
         
     | 
| 
       911 
1009 
     | 
    
         
             
            <p>Execution example:</p>
         
     | 
| 
       912 
1010 
     | 
    
         
             
            <div class="highlight-none"><div class="highlight"><pre>tokenize TokenBigram "Hello 日 本 語 ! ! ! 777" NormalizerAuto
         
     | 
| 
       913 
1011 
     | 
    
         
             
            # [
         
     | 
| 
         @@ -919,41 +1017,49 @@ alphabets and digits.</p> 
     | 
|
| 
       919 
1017 
     | 
    
         
             
            #   [
         
     | 
| 
       920 
1018 
     | 
    
         
             
            #     {
         
     | 
| 
       921 
1019 
     | 
    
         
             
            #       "position": 0,
         
     | 
| 
      
 1020 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       922 
1021 
     | 
    
         
             
            #       "value": "hello"
         
     | 
| 
       923 
1022 
     | 
    
         
             
            #     },
         
     | 
| 
       924 
1023 
     | 
    
         
             
            #     {
         
     | 
| 
       925 
1024 
     | 
    
         
             
            #       "position": 1,
         
     | 
| 
      
 1025 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       926 
1026 
     | 
    
         
             
            #       "value": "日"
         
     | 
| 
       927 
1027 
     | 
    
         
             
            #     },
         
     | 
| 
       928 
1028 
     | 
    
         
             
            #     {
         
     | 
| 
       929 
1029 
     | 
    
         
             
            #       "position": 2,
         
     | 
| 
      
 1030 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       930 
1031 
     | 
    
         
             
            #       "value": "本"
         
     | 
| 
       931 
1032 
     | 
    
         
             
            #     },
         
     | 
| 
       932 
1033 
     | 
    
         
             
            #     {
         
     | 
| 
       933 
1034 
     | 
    
         
             
            #       "position": 3,
         
     | 
| 
      
 1035 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       934 
1036 
     | 
    
         
             
            #       "value": "語"
         
     | 
| 
       935 
1037 
     | 
    
         
             
            #     },
         
     | 
| 
       936 
1038 
     | 
    
         
             
            #     {
         
     | 
| 
       937 
1039 
     | 
    
         
             
            #       "position": 4,
         
     | 
| 
      
 1040 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       938 
1041 
     | 
    
         
             
            #       "value": "!"
         
     | 
| 
       939 
1042 
     | 
    
         
             
            #     },
         
     | 
| 
       940 
1043 
     | 
    
         
             
            #     {
         
     | 
| 
       941 
1044 
     | 
    
         
             
            #       "position": 5,
         
     | 
| 
      
 1045 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       942 
1046 
     | 
    
         
             
            #       "value": "!"
         
     | 
| 
       943 
1047 
     | 
    
         
             
            #     },
         
     | 
| 
       944 
1048 
     | 
    
         
             
            #     {
         
     | 
| 
       945 
1049 
     | 
    
         
             
            #       "position": 6,
         
     | 
| 
      
 1050 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       946 
1051 
     | 
    
         
             
            #       "value": "!"
         
     | 
| 
       947 
1052 
     | 
    
         
             
            #     },
         
     | 
| 
       948 
1053 
     | 
    
         
             
            #     {
         
     | 
| 
       949 
1054 
     | 
    
         
             
            #       "position": 7,
         
     | 
| 
      
 1055 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       950 
1056 
     | 
    
         
             
            #       "value": "777"
         
     | 
| 
       951 
1057 
     | 
    
         
             
            #     }
         
     | 
| 
       952 
1058 
     | 
    
         
             
            #   ]
         
     | 
| 
       953 
1059 
     | 
    
         
             
            # ]
         
     | 
| 
       954 
1060 
     | 
    
         
             
            </pre></div>
         
     | 
| 
       955 
1061 
     | 
    
         
             
            </div>
         
     | 
| 
       956 
     | 
    
         
            -
            <p>Here is a result by < 
     | 
| 
      
 1062 
     | 
    
         
            +
            <p>Here is a result by <code class="docutils literal"><span class="pre">TokenBigramIgnoreBlankSplitSymbolAlphaDigit</span></code>:</p>
         
     | 
| 
       957 
1063 
     | 
    
         
             
            <p>Execution example:</p>
         
     | 
| 
       958 
1064 
     | 
    
         
             
            <div class="highlight-none"><div class="highlight"><pre>tokenize TokenBigramIgnoreBlankSplitSymbolAlphaDigit "Hello 日 本 語 ! ! ! 777" NormalizerAuto
         
     | 
| 
       959 
1065 
     | 
    
         
             
            # [
         
     | 
| 
         @@ -965,58 +1071,72 @@ alphabets and digits.</p> 
     | 
|
| 
       965 
1071 
     | 
    
         
             
            #   [
         
     | 
| 
       966 
1072 
     | 
    
         
             
            #     {
         
     | 
| 
       967 
1073 
     | 
    
         
             
            #       "position": 0,
         
     | 
| 
      
 1074 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       968 
1075 
     | 
    
         
             
            #       "value": "he"
         
     | 
| 
       969 
1076 
     | 
    
         
             
            #     },
         
     | 
| 
       970 
1077 
     | 
    
         
             
            #     {
         
     | 
| 
       971 
1078 
     | 
    
         
             
            #       "position": 1,
         
     | 
| 
      
 1079 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       972 
1080 
     | 
    
         
             
            #       "value": "el"
         
     | 
| 
       973 
1081 
     | 
    
         
             
            #     },
         
     | 
| 
       974 
1082 
     | 
    
         
             
            #     {
         
     | 
| 
       975 
1083 
     | 
    
         
             
            #       "position": 2,
         
     | 
| 
      
 1084 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       976 
1085 
     | 
    
         
             
            #       "value": "ll"
         
     | 
| 
       977 
1086 
     | 
    
         
             
            #     },
         
     | 
| 
       978 
1087 
     | 
    
         
             
            #     {
         
     | 
| 
       979 
1088 
     | 
    
         
             
            #       "position": 3,
         
     | 
| 
      
 1089 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       980 
1090 
     | 
    
         
             
            #       "value": "lo"
         
     | 
| 
       981 
1091 
     | 
    
         
             
            #     },
         
     | 
| 
       982 
1092 
     | 
    
         
             
            #     {
         
     | 
| 
       983 
1093 
     | 
    
         
             
            #       "position": 4,
         
     | 
| 
      
 1094 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       984 
1095 
     | 
    
         
             
            #       "value": "o日"
         
     | 
| 
       985 
1096 
     | 
    
         
             
            #     },
         
     | 
| 
       986 
1097 
     | 
    
         
             
            #     {
         
     | 
| 
       987 
1098 
     | 
    
         
             
            #       "position": 5,
         
     | 
| 
      
 1099 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       988 
1100 
     | 
    
         
             
            #       "value": "日本"
         
     | 
| 
       989 
1101 
     | 
    
         
             
            #     },
         
     | 
| 
       990 
1102 
     | 
    
         
             
            #     {
         
     | 
| 
       991 
1103 
     | 
    
         
             
            #       "position": 6,
         
     | 
| 
      
 1104 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       992 
1105 
     | 
    
         
             
            #       "value": "本語"
         
     | 
| 
       993 
1106 
     | 
    
         
             
            #     },
         
     | 
| 
       994 
1107 
     | 
    
         
             
            #     {
         
     | 
| 
       995 
1108 
     | 
    
         
             
            #       "position": 7,
         
     | 
| 
      
 1109 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       996 
1110 
     | 
    
         
             
            #       "value": "語!"
         
     | 
| 
       997 
1111 
     | 
    
         
             
            #     },
         
     | 
| 
       998 
1112 
     | 
    
         
             
            #     {
         
     | 
| 
       999 
1113 
     | 
    
         
             
            #       "position": 8,
         
     | 
| 
      
 1114 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       1000 
1115 
     | 
    
         
             
            #       "value": "!!"
         
     | 
| 
       1001 
1116 
     | 
    
         
             
            #     },
         
     | 
| 
       1002 
1117 
     | 
    
         
             
            #     {
         
     | 
| 
       1003 
1118 
     | 
    
         
             
            #       "position": 9,
         
     | 
| 
      
 1119 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       1004 
1120 
     | 
    
         
             
            #       "value": "!!"
         
     | 
| 
       1005 
1121 
     | 
    
         
             
            #     },
         
     | 
| 
       1006 
1122 
     | 
    
         
             
            #     {
         
     | 
| 
       1007 
1123 
     | 
    
         
             
            #       "position": 10,
         
     | 
| 
      
 1124 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       1008 
1125 
     | 
    
         
             
            #       "value": "!7"
         
     | 
| 
       1009 
1126 
     | 
    
         
             
            #     },
         
     | 
| 
       1010 
1127 
     | 
    
         
             
            #     {
         
     | 
| 
       1011 
1128 
     | 
    
         
             
            #       "position": 11,
         
     | 
| 
      
 1129 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       1012 
1130 
     | 
    
         
             
            #       "value": "77"
         
     | 
| 
       1013 
1131 
     | 
    
         
             
            #     },
         
     | 
| 
       1014 
1132 
     | 
    
         
             
            #     {
         
     | 
| 
       1015 
1133 
     | 
    
         
             
            #       "position": 12,
         
     | 
| 
      
 1134 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       1016 
1135 
     | 
    
         
             
            #       "value": "77"
         
     | 
| 
       1017 
1136 
     | 
    
         
             
            #     },
         
     | 
| 
       1018 
1137 
     | 
    
         
             
            #     {
         
     | 
| 
       1019 
1138 
     | 
    
         
             
            #       "position": 13,
         
     | 
| 
      
 1139 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       1020 
1140 
     | 
    
         
             
            #       "value": "7"
         
     | 
| 
       1021 
1141 
     | 
    
         
             
            #     }
         
     | 
| 
       1022 
1142 
     | 
    
         
             
            #   ]
         
     | 
| 
         @@ -1025,10 +1145,10 @@ alphabets and digits.</p> 
     | 
|
| 
       1025 
1145 
     | 
    
         
             
            </div>
         
     | 
| 
       1026 
1146 
     | 
    
         
             
            </div>
         
     | 
| 
       1027 
1147 
     | 
    
         
             
            <div class="section" id="tokenunigram">
         
     | 
| 
       1028 
     | 
    
         
            -
            <span id="token-unigram"></span><h3>7.8.3.9. < 
     | 
| 
       1029 
     | 
    
         
            -
            <p>< 
     | 
| 
       1030 
     | 
    
         
            -
            between them is token unit. <a class="reference internal" href="#token-bigram">< 
     | 
| 
       1031 
     | 
    
         
            -
            token. < 
     | 
| 
      
 1148 
     | 
    
         
            +
            <span id="token-unigram"></span><h3>7.8.3.9. <code class="docutils literal"><span class="pre">TokenUnigram</span></code><a class="headerlink" href="#tokenunigram" title="Permalink to this headline">¶</a></h3>
         
     | 
| 
      
 1149 
     | 
    
         
            +
            <p><code class="docutils literal"><span class="pre">TokenUnigram</span></code> is similar to <a class="reference internal" href="#token-bigram"><span>TokenBigram</span></a>. The difference
         
     | 
| 
      
 1150 
     | 
    
         
            +
            between them is token unit. <a class="reference internal" href="#token-bigram"><span>TokenBigram</span></a> uses 2 characters per
         
     | 
| 
      
 1151 
     | 
    
         
            +
            token. <code class="docutils literal"><span class="pre">TokenUnigram</span></code> uses 1 character per token.</p>
         
     | 
| 
       1032 
1152 
     | 
    
         
             
            <p>Execution example:</p>
         
     | 
| 
       1033 
1153 
     | 
    
         
             
            <div class="highlight-none"><div class="highlight"><pre>tokenize TokenUnigram "100cents!!!" NormalizerAuto
         
     | 
| 
       1034 
1154 
     | 
    
         
             
            # [
         
     | 
| 
         @@ -1040,14 +1160,17 @@ token. <tt class="docutils literal"><span class="pre">TokenUnigram</span></tt> u 
     | 
|
| 
       1040 
1160 
     | 
    
         
             
            #   [
         
     | 
| 
       1041 
1161 
     | 
    
         
             
            #     {
         
     | 
| 
       1042 
1162 
     | 
    
         
             
            #       "position": 0,
         
     | 
| 
      
 1163 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       1043 
1164 
     | 
    
         
             
            #       "value": "100"
         
     | 
| 
       1044 
1165 
     | 
    
         
             
            #     },
         
     | 
| 
       1045 
1166 
     | 
    
         
             
            #     {
         
     | 
| 
       1046 
1167 
     | 
    
         
             
            #       "position": 1,
         
     | 
| 
      
 1168 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       1047 
1169 
     | 
    
         
             
            #       "value": "cents"
         
     | 
| 
       1048 
1170 
     | 
    
         
             
            #     },
         
     | 
| 
       1049 
1171 
     | 
    
         
             
            #     {
         
     | 
| 
       1050 
1172 
     | 
    
         
             
            #       "position": 2,
         
     | 
| 
      
 1173 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       1051 
1174 
     | 
    
         
             
            #       "value": "!!!"
         
     | 
| 
       1052 
1175 
     | 
    
         
             
            #     }
         
     | 
| 
       1053 
1176 
     | 
    
         
             
            #   ]
         
     | 
| 
         @@ -1056,10 +1179,10 @@ token. <tt class="docutils literal"><span class="pre">TokenUnigram</span></tt> u 
     | 
|
| 
       1056 
1179 
     | 
    
         
             
            </div>
         
     | 
| 
       1057 
1180 
     | 
    
         
             
            </div>
         
     | 
| 
       1058 
1181 
     | 
    
         
             
            <div class="section" id="tokentrigram">
         
     | 
| 
       1059 
     | 
    
         
            -
            <span id="token-trigram"></span><h3>7.8.3.10. < 
     | 
| 
       1060 
     | 
    
         
            -
            <p>< 
     | 
| 
       1061 
     | 
    
         
            -
            between them is token unit. <a class="reference internal" href="#token-bigram">< 
     | 
| 
       1062 
     | 
    
         
            -
            token. < 
     | 
| 
      
 1182 
     | 
    
         
            +
            <span id="token-trigram"></span><h3>7.8.3.10. <code class="docutils literal"><span class="pre">TokenTrigram</span></code><a class="headerlink" href="#tokentrigram" title="Permalink to this headline">¶</a></h3>
         
     | 
| 
      
 1183 
     | 
    
         
            +
            <p><code class="docutils literal"><span class="pre">TokenTrigram</span></code> is similar to <a class="reference internal" href="#token-bigram"><span>TokenBigram</span></a>. The difference
         
     | 
| 
      
 1184 
     | 
    
         
            +
            between them is token unit. <a class="reference internal" href="#token-bigram"><span>TokenBigram</span></a> uses 2 characters per
         
     | 
| 
      
 1185 
     | 
    
         
            +
            token. <code class="docutils literal"><span class="pre">TokenTrigram</span></code> uses 3 characters per token.</p>
         
     | 
| 
       1063 
1186 
     | 
    
         
             
            <p>Execution example:</p>
         
     | 
| 
       1064 
1187 
     | 
    
         
             
            <div class="highlight-none"><div class="highlight"><pre>tokenize TokenTrigram "10000cents!!!!!" NormalizerAuto
         
     | 
| 
       1065 
1188 
     | 
    
         
             
            # [
         
     | 
| 
         @@ -1071,14 +1194,17 @@ token. <tt class="docutils literal"><span class="pre">TokenTrigram</span></tt> u 
     | 
|
| 
       1071 
1194 
     | 
    
         
             
            #   [
         
     | 
| 
       1072 
1195 
     | 
    
         
             
            #     {
         
     | 
| 
       1073 
1196 
     | 
    
         
             
            #       "position": 0,
         
     | 
| 
      
 1197 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       1074 
1198 
     | 
    
         
             
            #       "value": "10000"
         
     | 
| 
       1075 
1199 
     | 
    
         
             
            #     },
         
     | 
| 
       1076 
1200 
     | 
    
         
             
            #     {
         
     | 
| 
       1077 
1201 
     | 
    
         
             
            #       "position": 1,
         
     | 
| 
      
 1202 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       1078 
1203 
     | 
    
         
             
            #       "value": "cents"
         
     | 
| 
       1079 
1204 
     | 
    
         
             
            #     },
         
     | 
| 
       1080 
1205 
     | 
    
         
             
            #     {
         
     | 
| 
       1081 
1206 
     | 
    
         
             
            #       "position": 2,
         
     | 
| 
      
 1207 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       1082 
1208 
     | 
    
         
             
            #       "value": "!!!!!"
         
     | 
| 
       1083 
1209 
     | 
    
         
             
            #     }
         
     | 
| 
       1084 
1210 
     | 
    
         
             
            #   ]
         
     | 
| 
         @@ -1087,14 +1213,14 @@ token. <tt class="docutils literal"><span class="pre">TokenTrigram</span></tt> u 
     | 
|
| 
       1087 
1213 
     | 
    
         
             
            </div>
         
     | 
| 
       1088 
1214 
     | 
    
         
             
            </div>
         
     | 
| 
       1089 
1215 
     | 
    
         
             
            <div class="section" id="tokendelimit">
         
     | 
| 
       1090 
     | 
    
         
            -
            <span id="token-delimit"></span><h3>7.8.3.11. < 
     | 
| 
       1091 
     | 
    
         
            -
            <p>< 
     | 
| 
       1092 
     | 
    
         
            -
            characters (< 
     | 
| 
       1093 
     | 
    
         
            -
            < 
     | 
| 
       1094 
     | 
    
         
            -
            <p>< 
     | 
| 
       1095 
     | 
    
         
            -
            and < 
     | 
| 
       1096 
     | 
    
         
            -
            <span class="pre">full-text-search</span> <span class="pre">http</span></ 
     | 
| 
       1097 
     | 
    
         
            -
            <p>Here is an example of < 
     | 
| 
      
 1216 
     | 
    
         
            +
            <span id="token-delimit"></span><h3>7.8.3.11. <code class="docutils literal"><span class="pre">TokenDelimit</span></code><a class="headerlink" href="#tokendelimit" title="Permalink to this headline">¶</a></h3>
         
     | 
| 
      
 1217 
     | 
    
         
            +
            <p><code class="docutils literal"><span class="pre">TokenDelimit</span></code> extracts tokens by splitting on one or more space
         
     | 
| 
      
 1218 
     | 
    
         
            +
            characters (<code class="docutils literal"><span class="pre">U+0020</span></code>). For example, <code class="docutils literal"><span class="pre">Hello</span> <span class="pre">World</span></code> is tokenized to
         
     | 
| 
      
 1219 
     | 
    
         
            +
            <code class="docutils literal"><span class="pre">Hello</span></code> and <code class="docutils literal"><span class="pre">World</span></code>.</p>
         
     | 
| 
      
 1220 
     | 
    
         
            +
            <p><code class="docutils literal"><span class="pre">TokenDelimit</span></code> is suitable for tag text. You can extract <code class="docutils literal"><span class="pre">groonga</span></code>
         
     | 
| 
      
 1221 
     | 
    
         
            +
            and <code class="docutils literal"><span class="pre">full-text-search</span></code> and <code class="docutils literal"><span class="pre">http</span></code> as tags from <code class="docutils literal"><span class="pre">groonga</span>
         
     | 
| 
      
 1222 
     | 
    
         
            +
            <span class="pre">full-text-search</span> <span class="pre">http</span></code>.</p>
         
     | 
| 
      
 1223 
     | 
    
         
            +
            <p>Here is an example of <code class="docutils literal"><span class="pre">TokenDelimit</span></code>:</p>
         
     | 
| 
       1098 
1224 
     | 
    
         
             
            <p>Execution example:</p>
         
     | 
| 
       1099 
1225 
     | 
    
         
             
            <div class="highlight-none"><div class="highlight"><pre>tokenize TokenDelimit "Groonga full-text-search HTTP" NormalizerAuto
         
     | 
| 
       1100 
1226 
     | 
    
         
             
            # [
         
     | 
| 
         @@ -1106,14 +1232,17 @@ and <tt class="docutils literal"><span class="pre">full-text-search</span></tt> 
     | 
|
| 
       1106 
1232 
     | 
    
         
             
            #   [
         
     | 
| 
       1107 
1233 
     | 
    
         
             
            #     {
         
     | 
| 
       1108 
1234 
     | 
    
         
             
            #       "position": 0,
         
     | 
| 
      
 1235 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       1109 
1236 
     | 
    
         
             
            #       "value": "groonga"
         
     | 
| 
       1110 
1237 
     | 
    
         
             
            #     },
         
     | 
| 
       1111 
1238 
     | 
    
         
             
            #     {
         
     | 
| 
       1112 
1239 
     | 
    
         
             
            #       "position": 1,
         
     | 
| 
      
 1240 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       1113 
1241 
     | 
    
         
             
            #       "value": "full-text-search"
         
     | 
| 
       1114 
1242 
     | 
    
         
             
            #     },
         
     | 
| 
       1115 
1243 
     | 
    
         
             
            #     {
         
     | 
| 
       1116 
1244 
     | 
    
         
             
            #       "position": 2,
         
     | 
| 
      
 1245 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       1117 
1246 
     | 
    
         
             
            #       "value": "http"
         
     | 
| 
       1118 
1247 
     | 
    
         
             
            #     }
         
     | 
| 
       1119 
1248 
     | 
    
         
             
            #   ]
         
     | 
| 
         @@ -1122,13 +1251,13 @@ and <tt class="docutils literal"><span class="pre">full-text-search</span></tt> 
     | 
|
| 
       1122 
1251 
     | 
    
         
             
            </div>
         
     | 
| 
       1123 
1252 
     | 
    
         
             
            </div>
         
     | 
| 
       1124 
1253 
     | 
    
         
             
            <div class="section" id="tokendelimitnull">
         
     | 
| 
       1125 
     | 
    
         
            -
            <span id="token-delimit-null"></span><h3>7.8.3.12. < 
     | 
| 
       1126 
     | 
    
         
            -
            <p>< 
     | 
| 
       1127 
     | 
    
         
            -
            difference between them is separator character. <a class="reference internal" href="#token-delimit">< 
     | 
| 
       1128 
     | 
    
         
            -
            uses space character (< 
     | 
| 
       1129 
     | 
    
         
            -
            character (< 
     | 
| 
       1130 
     | 
    
         
            -
            <p>< 
     | 
| 
       1131 
     | 
    
         
            -
            <p>Here is an example of < 
     | 
| 
      
 1254 
     | 
    
         
            +
            <span id="token-delimit-null"></span><h3>7.8.3.12. <code class="docutils literal"><span class="pre">TokenDelimitNull</span></code><a class="headerlink" href="#tokendelimitnull" title="Permalink to this headline">¶</a></h3>
         
     | 
| 
      
 1255 
     | 
    
         
            +
            <p><code class="docutils literal"><span class="pre">TokenDelimitNull</span></code> is similar to <a class="reference internal" href="#token-delimit"><span>TokenDelimit</span></a>. The
         
     | 
| 
      
 1256 
     | 
    
         
            +
            difference between them is separator character. <a class="reference internal" href="#token-delimit"><span>TokenDelimit</span></a>
         
     | 
| 
      
 1257 
     | 
    
         
            +
            uses space character (<code class="docutils literal"><span class="pre">U+0020</span></code>) but <code class="docutils literal"><span class="pre">TokenDelimitNull</span></code> uses NUL
         
     | 
| 
      
 1258 
     | 
    
         
            +
            character (<code class="docutils literal"><span class="pre">U+0000</span></code>).</p>
         
     | 
| 
      
 1259 
     | 
    
         
            +
            <p><code class="docutils literal"><span class="pre">TokenDelimitNull</span></code> is also suitable for tag text.</p>
         
     | 
| 
      
 1260 
     | 
    
         
            +
            <p>Here is an example of <code class="docutils literal"><span class="pre">TokenDelimitNull</span></code>:</p>
         
     | 
| 
       1132 
1261 
     | 
    
         
             
            <p>Execution example:</p>
         
     | 
| 
       1133 
1262 
     | 
    
         
             
            <div class="highlight-none"><div class="highlight"><pre>tokenize TokenDelimitNull "Groonga\u0000full-text-search\u0000HTTP" NormalizerAuto
         
     | 
| 
       1134 
1263 
     | 
    
         
             
            # [
         
     | 
| 
         @@ -1140,6 +1269,7 @@ character (<tt class="docutils literal"><span class="pre">U+0000</span></tt>).</ 
     | 
|
| 
       1140 
1269 
     | 
    
         
             
            #   [
         
     | 
| 
       1141 
1270 
     | 
    
         
             
            #     {
         
     | 
| 
       1142 
1271 
     | 
    
         
             
            #       "position": 0,
         
     | 
| 
      
 1272 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       1143 
1273 
     | 
    
         
             
            #       "value": "groongau0000full-text-searchu0000http"
         
     | 
| 
       1144 
1274 
     | 
    
         
             
            #     }
         
     | 
| 
       1145 
1275 
     | 
    
         
             
            #   ]
         
     | 
| 
         @@ -1148,23 +1278,23 @@ character (<tt class="docutils literal"><span class="pre">U+0000</span></tt>).</ 
     | 
|
| 
       1148 
1278 
     | 
    
         
             
            </div>
         
     | 
| 
       1149 
1279 
     | 
    
         
             
            </div>
         
     | 
| 
       1150 
1280 
     | 
    
         
             
            <div class="section" id="tokenmecab">
         
     | 
| 
       1151 
     | 
    
         
            -
            <span id="token-mecab"></span><h3>7.8.3.13. < 
     | 
| 
       1152 
     | 
    
         
            -
            <p>< 
     | 
| 
      
 1281 
     | 
    
         
            +
            <span id="token-mecab"></span><h3>7.8.3.13. <code class="docutils literal"><span class="pre">TokenMecab</span></code><a class="headerlink" href="#tokenmecab" title="Permalink to this headline">¶</a></h3>
         
     | 
| 
      
 1282 
     | 
    
         
            +
            <p><code class="docutils literal"><span class="pre">TokenMecab</span></code> is a tokenizer based on <a class="reference external" href="http://mecab.sourceforge.net/">MeCab</a> part-of-speech and
         
     | 
| 
       1153 
1283 
     | 
    
         
             
            morphological analyzer.</p>
         
     | 
| 
       1154 
1284 
     | 
    
         
             
            <p>MeCab doesn't depend on Japanese. You can use MeCab for other
         
     | 
| 
       1155 
1285 
     | 
    
         
             
            languages by creating dictionary for the languages. You can use <a class="reference external" href="http://osdn.jp/projects/naist-jdic/">NAIST
         
     | 
| 
       1156 
1286 
     | 
    
         
             
            Japanese Dictionary</a>
         
     | 
| 
       1157 
1287 
     | 
    
         
             
            for Japanese.</p>
         
     | 
| 
       1158 
     | 
    
         
            -
            <p>< 
     | 
| 
       1159 
     | 
    
         
            -
            < 
     | 
| 
       1160 
     | 
    
         
            -
            <a class="reference internal" href="#token-bigram">< 
     | 
| 
       1161 
     | 
    
         
            -
            < 
     | 
| 
      
 1288 
     | 
    
         
            +
            <p><code class="docutils literal"><span class="pre">TokenMecab</span></code> is good for precision rather than recall. You can find
         
     | 
| 
      
 1289 
     | 
    
         
            +
            <code class="docutils literal"><span class="pre">東京都</span></code> and <code class="docutils literal"><span class="pre">京都</span></code> texts by <code class="docutils literal"><span class="pre">京都</span></code> query with
         
     | 
| 
      
 1290 
     | 
    
         
            +
            <a class="reference internal" href="#token-bigram"><span>TokenBigram</span></a> but <code class="docutils literal"><span class="pre">東京都</span></code> isn't expected. You can find only
         
     | 
| 
      
 1291 
     | 
    
         
            +
            <code class="docutils literal"><span class="pre">京都</span></code> text by <code class="docutils literal"><span class="pre">京都</span></code> query with <code class="docutils literal"><span class="pre">TokenMecab</span></code>.</p>
         
     | 
| 
       1162 
1292 
     | 
    
         
             
            <p>If you want to support neologisms, you need to keep updating your
         
     | 
| 
       1163 
     | 
    
         
            -
            MeCab dictionary. It needs maintain cost. (<a class="reference internal" href="#token-bigram">< 
     | 
| 
       1164 
     | 
    
         
            -
            require dictionary maintenance because <a class="reference internal" href="#token-bigram">< 
     | 
| 
      
 1293 
     | 
    
         
            +
            MeCab dictionary. This incurs a maintenance cost. (<a class="reference internal" href="#token-bigram"><span>TokenBigram</span></a> doesn't
         
     | 
| 
      
 1294 
     | 
    
         
            +
            require dictionary maintenance because <a class="reference internal" href="#token-bigram"><span>TokenBigram</span></a> doesn't use
         
     | 
| 
       1165 
1295 
     | 
    
         
             
            dictionary.) <a class="reference external" href="https://github.com/neologd/mecab-ipadic-neologd">mecab-ipadic-NEologd : Neologism dictionary for MeCab</a> may help you.</p>
         
     | 
| 
       1166 
     | 
    
         
            -
            <p>Here is an example of < 
     | 
| 
       1167 
     | 
    
         
            -
            and < 
     | 
| 
      
 1296 
     | 
    
         
            +
            <p>Here is an example of <code class="docutils literal"><span class="pre">TokenMecab</span></code>. <code class="docutils literal"><span class="pre">東京都</span></code> is tokenized to <code class="docutils literal"><span class="pre">東京</span></code>
         
     | 
| 
      
 1297 
     | 
    
         
            +
            and <code class="docutils literal"><span class="pre">都</span></code>. They don't include <code class="docutils literal"><span class="pre">京都</span></code>:</p>
         
     | 
| 
       1168 
1298 
     | 
    
         
             
            <p>Execution example:</p>
         
     | 
| 
       1169 
1299 
     | 
    
         
             
            <div class="highlight-none"><div class="highlight"><pre>tokenize TokenMecab "東京都"
         
     | 
| 
       1170 
1300 
     | 
    
         
             
            # [
         
     | 
| 
         @@ -1176,10 +1306,12 @@ and <tt class="docutils literal"><span class="pre">都</span></tt>. They don't i 
     | 
|
| 
       1176 
1306 
     | 
    
         
             
            #   [
         
     | 
| 
       1177 
1307 
     | 
    
         
             
            #     {
         
     | 
| 
       1178 
1308 
     | 
    
         
             
            #       "position": 0,
         
     | 
| 
      
 1309 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       1179 
1310 
     | 
    
         
             
            #       "value": "東京"
         
     | 
| 
       1180 
1311 
     | 
    
         
             
            #     },
         
     | 
| 
       1181 
1312 
     | 
    
         
             
            #     {
         
     | 
| 
       1182 
1313 
     | 
    
         
             
            #       "position": 1,
         
     | 
| 
      
 1314 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       1183 
1315 
     | 
    
         
             
            #       "value": "都"
         
     | 
| 
       1184 
1316 
     | 
    
         
             
            #     }
         
     | 
| 
       1185 
1317 
     | 
    
         
             
            #   ]
         
     | 
| 
         @@ -1188,7 +1320,7 @@ and <tt class="docutils literal"><span class="pre">都</span></tt>. They don't i 
     | 
|
| 
       1188 
1320 
     | 
    
         
             
            </div>
         
     | 
| 
       1189 
1321 
     | 
    
         
             
            </div>
         
     | 
| 
       1190 
1322 
     | 
    
         
             
            <div class="section" id="tokenregexp">
         
     | 
| 
       1191 
     | 
    
         
            -
            <span id="token-regexp"></span><h3>7.8.3.14. < 
     | 
| 
      
 1323 
     | 
    
         
            +
            <span id="token-regexp"></span><h3>7.8.3.14. <code class="docutils literal"><span class="pre">TokenRegexp</span></code><a class="headerlink" href="#tokenregexp" title="Permalink to this headline">¶</a></h3>
         
     | 
| 
       1192 
1324 
     | 
    
         
             
            <div class="versionadded">
         
     | 
| 
       1193 
1325 
     | 
    
         
             
            <p><span class="versionmodified">New in version 5.0.1.</span></p>
         
     | 
| 
       1194 
1326 
     | 
    
         
             
            </div>
         
     | 
| 
         @@ -1201,21 +1333,21 @@ and <tt class="docutils literal"><span class="pre">都</span></tt>. They don't i 
     | 
|
| 
       1201 
1333 
     | 
    
         
             
            <p class="last">This tokenizer can be used only with UTF-8. You can't use this
         
     | 
| 
       1202 
1334 
     | 
    
         
             
            tokenizer with EUC-JP, Shift_JIS and so on.</p>
         
     | 
| 
       1203 
1335 
     | 
    
         
             
            </div>
         
     | 
| 
       1204 
     | 
    
         
            -
            <p>< 
     | 
| 
      
 1336 
     | 
    
         
            +
            <p><code class="docutils literal"><span class="pre">TokenRegexp</span></code> is a tokenizer for supporting regular expression
         
     | 
| 
       1205 
1337 
     | 
    
         
             
            search by index.</p>
         
     | 
| 
       1206 
1338 
     | 
    
         
             
            <p>In general, regular expression search is evaluated as sequential
         
     | 
| 
       1207 
1339 
     | 
    
         
             
            search. But the following cases can be evaluated as index search:</p>
         
     | 
| 
       1208 
1340 
     | 
    
         
             
            <blockquote>
         
     | 
| 
       1209 
1341 
     | 
    
         
             
            <div><ul class="simple">
         
     | 
| 
       1210 
     | 
    
         
            -
            <li>Literal only case such as < 
     | 
| 
       1211 
     | 
    
         
            -
            <li>The beginning of text and literal case such as < 
     | 
| 
       1212 
     | 
    
         
            -
            <li>The end of text and literal case such as < 
     | 
| 
      
 1342 
     | 
    
         
            +
            <li>Literal only case such as <code class="docutils literal"><span class="pre">hello</span></code></li>
         
     | 
| 
      
 1343 
     | 
    
         
            +
            <li>The beginning of text and literal case such as <code class="docutils literal"><span class="pre">\A/home/alice</span></code></li>
         
     | 
| 
      
 1344 
     | 
    
         
            +
            <li>The end of text and literal case such as <code class="docutils literal"><span class="pre">\.txt\z</span></code></li>
         
     | 
| 
       1213 
1345 
     | 
    
         
             
            </ul>
         
     | 
| 
       1214 
1346 
     | 
    
         
             
            </div></blockquote>
         
     | 
| 
       1215 
1347 
     | 
    
         
             
            <p>In most cases, index search is faster than sequential search.</p>
         
     | 
| 
       1216 
     | 
    
         
            -
            <p>< 
     | 
| 
       1217 
     | 
    
         
            -
            adds the beginning of text mark (< 
     | 
| 
       1218 
     | 
    
         
            -
            and the end of text mark (< 
     | 
| 
      
 1348 
     | 
    
         
            +
            <p><code class="docutils literal"><span class="pre">TokenRegexp</span></code> is based on bigram tokenize method. <code class="docutils literal"><span class="pre">TokenRegexp</span></code>
         
     | 
| 
      
 1349 
     | 
    
         
            +
            adds the beginning of text mark (<code class="docutils literal"><span class="pre">U+FFEF</span></code>) at the beginning of text
         
     | 
| 
      
 1350 
     | 
    
         
            +
            and the end of text mark (<code class="docutils literal"><span class="pre">U+FFF0</span></code>) to the end of text when you
         
     | 
| 
       1219 
1351 
     | 
    
         
             
            index text:</p>
         
     | 
| 
       1220 
1352 
     | 
    
         
             
            <p>Execution example:</p>
         
     | 
| 
       1221 
1353 
     | 
    
         
             
            <div class="highlight-none"><div class="highlight"><pre>tokenize TokenRegexp "/home/alice/test.txt" NormalizerAuto --mode ADD
         
     | 
| 
         @@ -1228,194 +1360,112 @@ index text:</p> 
     | 
|
| 
       1228 
1360 
     | 
    
         
             
            #   [
         
     | 
| 
       1229 
1361 
     | 
    
         
             
            #     {
         
     | 
| 
       1230 
1362 
     | 
    
         
             
            #       "position": 0,
         
     | 
| 
      
 1363 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       1231 
1364 
     | 
    
         
             
            #       "value": ""
         
     | 
| 
       1232 
1365 
     | 
    
         
             
            #     },
         
     | 
| 
       1233 
1366 
     | 
    
         
             
            #     {
         
     | 
| 
       1234 
1367 
     | 
    
         
             
            #       "position": 1,
         
     | 
| 
      
 1368 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       1235 
1369 
     | 
    
         
             
            #       "value": "/h"
         
     | 
| 
       1236 
1370 
     | 
    
         
             
            #     },
         
     | 
| 
       1237 
1371 
     | 
    
         
             
            #     {
         
     | 
| 
       1238 
1372 
     | 
    
         
             
            #       "position": 2,
         
     | 
| 
      
 1373 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       1239 
1374 
     | 
    
         
             
            #       "value": "ho"
         
     | 
| 
       1240 
1375 
     | 
    
         
             
            #     },
         
     | 
| 
       1241 
1376 
     | 
    
         
             
            #     {
         
     | 
| 
       1242 
1377 
     | 
    
         
             
            #       "position": 3,
         
     | 
| 
      
 1378 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       1243 
1379 
     | 
    
         
             
            #       "value": "om"
         
     | 
| 
       1244 
1380 
     | 
    
         
             
            #     },
         
     | 
| 
       1245 
1381 
     | 
    
         
             
            #     {
         
     | 
| 
       1246 
1382 
     | 
    
         
             
            #       "position": 4,
         
     | 
| 
      
 1383 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       1247 
1384 
     | 
    
         
             
            #       "value": "me"
         
     | 
| 
       1248 
1385 
     | 
    
         
             
            #     },
         
     | 
| 
       1249 
1386 
     | 
    
         
             
            #     {
         
     | 
| 
       1250 
1387 
     | 
    
         
             
            #       "position": 5,
         
     | 
| 
      
 1388 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       1251 
1389 
     | 
    
         
             
            #       "value": "e/"
         
     | 
| 
       1252 
1390 
     | 
    
         
             
            #     },
         
     | 
| 
       1253 
1391 
     | 
    
         
             
            #     {
         
     | 
| 
       1254 
1392 
     | 
    
         
             
            #       "position": 6,
         
     | 
| 
      
 1393 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       1255 
1394 
     | 
    
         
             
            #       "value": "/a"
         
     | 
| 
       1256 
1395 
     | 
    
         
             
            #     },
         
     | 
| 
       1257 
1396 
     | 
    
         
             
            #     {
         
     | 
| 
       1258 
1397 
     | 
    
         
             
            #       "position": 7,
         
     | 
| 
      
 1398 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       1259 
1399 
     | 
    
         
             
            #       "value": "al"
         
     | 
| 
       1260 
1400 
     | 
    
         
             
            #     },
         
     | 
| 
       1261 
1401 
     | 
    
         
             
            #     {
         
     | 
| 
       1262 
1402 
     | 
    
         
             
            #       "position": 8,
         
     | 
| 
      
 1403 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       1263 
1404 
     | 
    
         
             
            #       "value": "li"
         
     | 
| 
       1264 
1405 
     | 
    
         
             
            #     },
         
     | 
| 
       1265 
1406 
     | 
    
         
             
            #     {
         
     | 
| 
       1266 
1407 
     | 
    
         
             
            #       "position": 9,
         
     | 
| 
      
 1408 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       1267 
1409 
     | 
    
         
             
            #       "value": "ic"
         
     | 
| 
       1268 
1410 
     | 
    
         
             
            #     },
         
     | 
| 
       1269 
1411 
     | 
    
         
             
            #     {
         
     | 
| 
       1270 
1412 
     | 
    
         
             
            #       "position": 10,
         
     | 
| 
      
 1413 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       1271 
1414 
     | 
    
         
             
            #       "value": "ce"
         
     | 
| 
       1272 
1415 
     | 
    
         
             
            #     },
         
     | 
| 
       1273 
1416 
     | 
    
         
             
            #     {
         
     | 
| 
       1274 
1417 
     | 
    
         
             
            #       "position": 11,
         
     | 
| 
      
 1418 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       1275 
1419 
     | 
    
         
             
            #       "value": "e/"
         
     | 
| 
       1276 
1420 
     | 
    
         
             
            #     },
         
     | 
| 
       1277 
1421 
     | 
    
         
             
            #     {
         
     | 
| 
       1278 
1422 
     | 
    
         
             
            #       "position": 12,
         
     | 
| 
      
 1423 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       1279 
1424 
     | 
    
         
             
            #       "value": "/t"
         
     | 
| 
       1280 
1425 
     | 
    
         
             
            #     },
         
     | 
| 
       1281 
1426 
     | 
    
         
             
            #     {
         
     | 
| 
       1282 
1427 
     | 
    
         
             
            #       "position": 13,
         
     | 
| 
      
 1428 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       1283 
1429 
     | 
    
         
             
            #       "value": "te"
         
     | 
| 
       1284 
1430 
     | 
    
         
             
            #     },
         
     | 
| 
       1285 
1431 
     | 
    
         
             
            #     {
         
     | 
| 
       1286 
1432 
     | 
    
         
             
            #       "position": 14,
         
     | 
| 
      
 1433 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       1287 
1434 
     | 
    
         
             
            #       "value": "es"
         
     | 
| 
       1288 
1435 
     | 
    
         
             
            #     },
         
     | 
| 
       1289 
1436 
     | 
    
         
             
            #     {
         
     | 
| 
       1290 
1437 
     | 
    
         
             
            #       "position": 15,
         
     | 
| 
      
 1438 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       1291 
1439 
     | 
    
         
             
            #       "value": "st"
         
     | 
| 
       1292 
1440 
     | 
    
         
             
            #     },
         
     | 
| 
       1293 
1441 
     | 
    
         
             
            #     {
         
     | 
| 
       1294 
1442 
     | 
    
         
             
            #       "position": 16,
         
     | 
| 
      
 1443 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       1295 
1444 
     | 
    
         
             
            #       "value": "t."
         
     | 
| 
       1296 
1445 
     | 
    
         
             
            #     },
         
     | 
| 
       1297 
1446 
     | 
    
         
             
            #     {
         
     | 
| 
       1298 
1447 
     | 
    
         
             
            #       "position": 17,
         
     | 
| 
      
 1448 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       1299 
1449 
     | 
    
         
             
            #       "value": ".t"
         
     | 
| 
       1300 
1450 
     | 
    
         
             
            #     },
         
     | 
| 
       1301 
1451 
     | 
    
         
             
            #     {
         
     | 
| 
       1302 
1452 
     | 
    
         
             
            #       "position": 18,
         
     | 
| 
      
 1453 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       1303 
1454 
     | 
    
         
             
            #       "value": "tx"
         
     | 
| 
       1304 
1455 
     | 
    
         
             
            #     },
         
     | 
| 
       1305 
1456 
     | 
    
         
             
            #     {
         
     | 
| 
       1306 
1457 
     | 
    
         
             
            #       "position": 19,
         
     | 
| 
      
 1458 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       1307 
1459 
     | 
    
         
             
            #       "value": "xt"
         
     | 
| 
       1308 
1460 
     | 
    
         
             
            #     },
         
     | 
| 
       1309 
1461 
     | 
    
         
             
            #     {
         
     | 
| 
       1310 
1462 
     | 
    
         
             
            #       "position": 20,
         
     | 
| 
      
 1463 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       1311 
1464 
     | 
    
         
             
            #       "value": "t"
         
     | 
| 
       1312 
1465 
     | 
    
         
             
            #     },
         
     | 
| 
       1313 
1466 
     | 
    
         
             
            #     {
         
     | 
| 
       1314 
1467 
     | 
    
         
             
            #       "position": 21,
         
     | 
| 
       1315 
     | 
    
         
            -
            #       " 
     | 
| 
       1316 
     | 
    
         
            -
            #     }
         
     | 
| 
       1317 
     | 
    
         
            -
            #   ]
         
     | 
| 
       1318 
     | 
    
         
            -
            # ]
         
     | 
| 
       1319 
     | 
    
         
            -
            </pre></div>
         
     | 
| 
       1320 
     | 
    
         
            -
            </div>
         
     | 
| 
       1321 
     | 
    
         
            -
            <p>The beginning of text mark is used for the beginning of text search by
         
     | 
| 
       1322 
     | 
    
         
            -
            <tt class="docutils literal"><span class="pre">\A</span></tt>. If you use <tt class="docutils literal"><span class="pre">TokenRegexp</span></tt> for tokenizing query,
         
     | 
| 
       1323 
     | 
    
         
            -
            <tt class="docutils literal"><span class="pre">TokenRegexp</span></tt> adds the beginning of text mark (<tt class="docutils literal"><span class="pre">U+FFEF</span></tt>) as the
         
     | 
| 
       1324 
     | 
    
         
            -
            first token. The beginning of text mark must appear first, so
         
     | 
| 
       1325 
     | 
    
         
            -
            you can get results of the beginning of text search.</p>
         
     | 
| 
       1326 
     | 
    
         
            -
            <p>Execution example:</p>
         
     | 
| 
       1327 
     | 
    
         
            -
            <div class="highlight-none"><div class="highlight"><pre>tokenize TokenRegexp "\\A/home/alice/" NormalizerAuto --mode GET
         
     | 
| 
       1328 
     | 
    
         
            -
            # [
         
     | 
| 
       1329 
     | 
    
         
            -
            #   [
         
     | 
| 
       1330 
     | 
    
         
            -
            #     0,
         
     | 
| 
       1331 
     | 
    
         
            -
            #     1337566253.89858,
         
     | 
| 
       1332 
     | 
    
         
            -
            #     0.000355720520019531
         
     | 
| 
       1333 
     | 
    
         
            -
            #   ],
         
     | 
| 
       1334 
     | 
    
         
            -
            #   [
         
     | 
| 
       1335 
     | 
    
         
            -
            #     {
         
     | 
| 
       1336 
     | 
    
         
            -
            #       "position": 0,
         
     | 
| 
       1337 
     | 
    
         
            -
            #       "value": ""
         
     | 
| 
       1338 
     | 
    
         
            -
            #     },
         
     | 
| 
       1339 
     | 
    
         
            -
            #     {
         
     | 
| 
       1340 
     | 
    
         
            -
            #       "position": 1,
         
     | 
| 
       1341 
     | 
    
         
            -
            #       "value": "/h"
         
     | 
| 
       1342 
     | 
    
         
            -
            #     },
         
     | 
| 
       1343 
     | 
    
         
            -
            #     {
         
     | 
| 
       1344 
     | 
    
         
            -
            #       "position": 2,
         
     | 
| 
       1345 
     | 
    
         
            -
            #       "value": "ho"
         
     | 
| 
       1346 
     | 
    
         
            -
            #     },
         
     | 
| 
       1347 
     | 
    
         
            -
            #     {
         
     | 
| 
       1348 
     | 
    
         
            -
            #       "position": 3,
         
     | 
| 
       1349 
     | 
    
         
            -
            #       "value": "om"
         
     | 
| 
       1350 
     | 
    
         
            -
            #     },
         
     | 
| 
       1351 
     | 
    
         
            -
            #     {
         
     | 
| 
       1352 
     | 
    
         
            -
            #       "position": 4,
         
     | 
| 
       1353 
     | 
    
         
            -
            #       "value": "me"
         
     | 
| 
       1354 
     | 
    
         
            -
            #     },
         
     | 
| 
       1355 
     | 
    
         
            -
            #     {
         
     | 
| 
       1356 
     | 
    
         
            -
            #       "position": 5,
         
     | 
| 
       1357 
     | 
    
         
            -
            #       "value": "e/"
         
     | 
| 
       1358 
     | 
    
         
            -
            #     },
         
     | 
| 
       1359 
     | 
    
         
            -
            #     {
         
     | 
| 
       1360 
     | 
    
         
            -
            #       "position": 6,
         
     | 
| 
       1361 
     | 
    
         
            -
            #       "value": "/a"
         
     | 
| 
       1362 
     | 
    
         
            -
            #     },
         
     | 
| 
       1363 
     | 
    
         
            -
            #     {
         
     | 
| 
       1364 
     | 
    
         
            -
            #       "position": 7,
         
     | 
| 
       1365 
     | 
    
         
            -
            #       "value": "al"
         
     | 
| 
       1366 
     | 
    
         
            -
            #     },
         
     | 
| 
       1367 
     | 
    
         
            -
            #     {
         
     | 
| 
       1368 
     | 
    
         
            -
            #       "position": 8,
         
     | 
| 
       1369 
     | 
    
         
            -
            #       "value": "li"
         
     | 
| 
       1370 
     | 
    
         
            -
            #     },
         
     | 
| 
       1371 
     | 
    
         
            -
            #     {
         
     | 
| 
       1372 
     | 
    
         
            -
            #       "position": 9,
         
     | 
| 
       1373 
     | 
    
         
            -
            #       "value": "ic"
         
     | 
| 
       1374 
     | 
    
         
            -
            #     },
         
     | 
| 
       1375 
     | 
    
         
            -
            #     {
         
     | 
| 
       1376 
     | 
    
         
            -
            #       "position": 10,
         
     | 
| 
       1377 
     | 
    
         
            -
            #       "value": "ce"
         
     | 
| 
       1378 
     | 
    
         
            -
            #     },
         
     | 
| 
       1379 
     | 
    
         
            -
            #     {
         
     | 
| 
       1380 
     | 
    
         
            -
            #       "position": 11,
         
     | 
| 
       1381 
     | 
    
         
            -
            #       "value": "e/"
         
     | 
| 
       1382 
     | 
    
         
            -
            #     }
         
     | 
| 
       1383 
     | 
    
         
            -
            #   ]
         
     | 
| 
       1384 
     | 
    
         
            -
            # ]
         
     | 
| 
       1385 
     | 
    
         
            -
            </pre></div>
         
     | 
| 
       1386 
     | 
    
         
            -
            </div>
         
     | 
| 
       1387 
     | 
    
         
            -
            <p>The end of text mark is used for the end of text search by <tt class="docutils literal"><span class="pre">\z</span></tt>.
         
     | 
| 
       1388 
     | 
    
         
            -
            If you use <tt class="docutils literal"><span class="pre">TokenRegexp</span></tt> for tokenizing query, <tt class="docutils literal"><span class="pre">TokenRegexp</span></tt> adds
         
     | 
| 
       1389 
     | 
    
         
            -
            the end of text mark (<tt class="docutils literal"><span class="pre">U+FFF0</span></tt>) as the last token. The end of text
         
     | 
| 
       1390 
     | 
    
         
            -
            mark must appear at the end, so you can get results of the end of
         
     | 
| 
       1391 
     | 
    
         
            -
            text search.</p>
         
     | 
| 
       1392 
     | 
    
         
            -
            <p>Execution example:</p>
         
     | 
| 
       1393 
     | 
    
         
            -
            <div class="highlight-none"><div class="highlight"><pre>tokenize TokenRegexp "\\.txt\\z" NormalizerAuto --mode GET
         
     | 
| 
       1394 
     | 
    
         
            -
            # [
         
     | 
| 
       1395 
     | 
    
         
            -
            #   [
         
     | 
| 
       1396 
     | 
    
         
            -
            #     0,
         
     | 
| 
       1397 
     | 
    
         
            -
            #     1337566253.89858,
         
     | 
| 
       1398 
     | 
    
         
            -
            #     0.000355720520019531
         
     | 
| 
       1399 
     | 
    
         
            -
            #   ],
         
     | 
| 
       1400 
     | 
    
         
            -
            #   [
         
     | 
| 
       1401 
     | 
    
         
            -
            #     {
         
     | 
| 
       1402 
     | 
    
         
            -
            #       "position": 0,
         
     | 
| 
       1403 
     | 
    
         
            -
            #       "value": "\\."
         
     | 
| 
       1404 
     | 
    
         
            -
            #     },
         
     | 
| 
       1405 
     | 
    
         
            -
            #     {
         
     | 
| 
       1406 
     | 
    
         
            -
            #       "position": 1,
         
     | 
| 
       1407 
     | 
    
         
            -
            #       "value": ".t"
         
     | 
| 
       1408 
     | 
    
         
            -
            #     },
         
     | 
| 
       1409 
     | 
    
         
            -
            #     {
         
     | 
| 
       1410 
     | 
    
         
            -
            #       "position": 2,
         
     | 
| 
       1411 
     | 
    
         
            -
            #       "value": "tx"
         
     | 
| 
       1412 
     | 
    
         
            -
            #     },
         
     | 
| 
       1413 
     | 
    
         
            -
            #     {
         
     | 
| 
       1414 
     | 
    
         
            -
            #       "position": 3,
         
     | 
| 
       1415 
     | 
    
         
            -
            #       "value": "xt"
         
     | 
| 
       1416 
     | 
    
         
            -
            #     },
         
     | 
| 
       1417 
     | 
    
         
            -
            #     {
         
     | 
| 
       1418 
     | 
    
         
            -
            #       "position": 5,
         
     | 
| 
      
 1468 
     | 
    
         
            +
            #       "force_prefix": false,
         
     | 
| 
       1419 
1469 
     | 
    
         
             
            #       "value": ""
         
     | 
| 
       1420 
1470 
     | 
    
         
             
            #     }
         
     | 
| 
       1421 
1471 
     | 
    
         
             
            #   ]
         
     | 
| 
         @@ -1430,7 +1480,7 @@ text search.</p> 
     | 
|
| 
       1430 
1480 
     | 
    
         
             
                      </div>
         
     | 
| 
       1431 
1481 
     | 
    
         
             
                    </div>
         
     | 
| 
       1432 
1482 
     | 
    
         
             
                  </div>
         
     | 
| 
       1433 
     | 
    
         
            -
                  <div class="sphinxsidebar">
         
     | 
| 
      
 1483 
     | 
    
         
            +
                  <div class="sphinxsidebar" role="navigation" aria-label="main navigation">
         
     | 
| 
       1434 
1484 
     | 
    
         
             
                    <div class="sphinxsidebarwrapper">
         
     | 
| 
       1435 
1485 
     | 
    
         
             
              <h3><a href="../index.html">Table Of Contents</a></h3>
         
     | 
| 
       1436 
1486 
     | 
    
         
             
              <ul>
         
     | 
| 
         @@ -1438,20 +1488,20 @@ text search.</p> 
     | 
|
| 
       1438 
1488 
     | 
    
         
             
            <li><a class="reference internal" href="#summary">7.8.1. Summary</a></li>
         
     | 
| 
       1439 
1489 
     | 
    
         
             
            <li><a class="reference internal" href="#what-is-tokenize">7.8.2. What is "tokenize"?</a></li>
         
     | 
| 
       1440 
1490 
     | 
    
         
             
            <li><a class="reference internal" href="#built-in-tokenizsers">7.8.3. Built-in tokenizers</a><ul>
         
     | 
| 
       1441 
     | 
    
         
            -
            <li><a class="reference internal" href="#tokenbigram">7.8.3.1. < 
     | 
| 
       1442 
     | 
    
         
            -
            <li><a class="reference internal" href="#tokenbigramsplitsymbol">7.8.3.2. < 
     | 
| 
       1443 
     | 
    
         
            -
            <li><a class="reference internal" href="#tokenbigramsplitsymbolalpha">7.8.3.3. < 
     | 
| 
       1444 
     | 
    
         
            -
            <li><a class="reference internal" href="#tokenbigramsplitsymbolalphadigit">7.8.3.4. < 
     | 
| 
       1445 
     | 
    
         
            -
            <li><a class="reference internal" href="#tokenbigramignoreblank">7.8.3.5. < 
     | 
| 
       1446 
     | 
    
         
            -
            <li><a class="reference internal" href="#tokenbigramignoreblanksplitsymbol">7.8.3.6. < 
     | 
| 
       1447 
     | 
    
         
            -
            <li><a class="reference internal" href="#tokenbigramignoreblanksplitsymbolalpha">7.8.3.7. < 
     | 
| 
       1448 
     | 
    
         
            -
            <li><a class="reference internal" href="#tokenbigramignoreblanksplitsymbolalphadigit">7.8.3.8. < 
     | 
| 
       1449 
     | 
    
         
            -
            <li><a class="reference internal" href="#tokenunigram">7.8.3.9. < 
     | 
| 
       1450 
     | 
    
         
            -
            <li><a class="reference internal" href="#tokentrigram">7.8.3.10. < 
     | 
| 
       1451 
     | 
    
         
            -
            <li><a class="reference internal" href="#tokendelimit">7.8.3.11. < 
     | 
| 
       1452 
     | 
    
         
            -
            <li><a class="reference internal" href="#tokendelimitnull">7.8.3.12. < 
     | 
| 
       1453 
     | 
    
         
            -
            <li><a class="reference internal" href="#tokenmecab">7.8.3.13. < 
     | 
| 
       1454 
     | 
    
         
            -
            <li><a class="reference internal" href="#tokenregexp">7.8.3.14. < 
     | 
| 
      
 1491 
     | 
    
         
            +
            <li><a class="reference internal" href="#tokenbigram">7.8.3.1. <code class="docutils literal"><span class="pre">TokenBigram</span></code></a></li>
         
     | 
| 
      
 1492 
     | 
    
         
            +
            <li><a class="reference internal" href="#tokenbigramsplitsymbol">7.8.3.2. <code class="docutils literal"><span class="pre">TokenBigramSplitSymbol</span></code></a></li>
         
     | 
| 
      
 1493 
     | 
    
         
            +
            <li><a class="reference internal" href="#tokenbigramsplitsymbolalpha">7.8.3.3. <code class="docutils literal"><span class="pre">TokenBigramSplitSymbolAlpha</span></code></a></li>
         
     | 
| 
      
 1494 
     | 
    
         
            +
            <li><a class="reference internal" href="#tokenbigramsplitsymbolalphadigit">7.8.3.4. <code class="docutils literal"><span class="pre">TokenBigramSplitSymbolAlphaDigit</span></code></a></li>
         
     | 
| 
      
 1495 
     | 
    
         
            +
            <li><a class="reference internal" href="#tokenbigramignoreblank">7.8.3.5. <code class="docutils literal"><span class="pre">TokenBigramIgnoreBlank</span></code></a></li>
         
     | 
| 
      
 1496 
     | 
    
         
            +
            <li><a class="reference internal" href="#tokenbigramignoreblanksplitsymbol">7.8.3.6. <code class="docutils literal"><span class="pre">TokenBigramIgnoreBlankSplitSymbol</span></code></a></li>
         
     | 
| 
      
 1497 
     | 
    
         
            +
            <li><a class="reference internal" href="#tokenbigramignoreblanksplitsymbolalpha">7.8.3.7. <code class="docutils literal"><span class="pre">TokenBigramIgnoreBlankSplitSymbolAlpha</span></code></a></li>
         
     | 
| 
      
 1498 
     | 
    
         
            +
            <li><a class="reference internal" href="#tokenbigramignoreblanksplitsymbolalphadigit">7.8.3.8. <code class="docutils literal"><span class="pre">TokenBigramIgnoreBlankSplitSymbolAlphaDigit</span></code></a></li>
         
     | 
| 
      
 1499 
     | 
    
         
            +
            <li><a class="reference internal" href="#tokenunigram">7.8.3.9. <code class="docutils literal"><span class="pre">TokenUnigram</span></code></a></li>
         
     | 
| 
      
 1500 
     | 
    
         
            +
            <li><a class="reference internal" href="#tokentrigram">7.8.3.10. <code class="docutils literal"><span class="pre">TokenTrigram</span></code></a></li>
         
     | 
| 
      
 1501 
     | 
    
         
            +
            <li><a class="reference internal" href="#tokendelimit">7.8.3.11. <code class="docutils literal"><span class="pre">TokenDelimit</span></code></a></li>
         
     | 
| 
      
 1502 
     | 
    
         
            +
            <li><a class="reference internal" href="#tokendelimitnull">7.8.3.12. <code class="docutils literal"><span class="pre">TokenDelimitNull</span></code></a></li>
         
     | 
| 
      
 1503 
     | 
    
         
            +
            <li><a class="reference internal" href="#tokenmecab">7.8.3.13. <code class="docutils literal"><span class="pre">TokenMecab</span></code></a></li>
         
     | 
| 
      
 1504 
     | 
    
         
            +
            <li><a class="reference internal" href="#tokenregexp">7.8.3.14. <code class="docutils literal"><span class="pre">TokenRegexp</span></code></a></li>
         
     | 
| 
       1455 
1505 
     | 
    
         
             
            </ul>
         
     | 
| 
       1456 
1506 
     | 
    
         
             
            </li>
         
     | 
| 
       1457 
1507 
     | 
    
         
             
            </ul>
         
     | 
| 
         @@ -1464,12 +1514,14 @@ text search.</p> 
     | 
|
| 
       1464 
1514 
     | 
    
         
             
              <h4>Next topic</h4>
         
     | 
| 
       1465 
1515 
     | 
    
         
             
              <p class="topless"><a href="token_filters.html"
         
     | 
| 
       1466 
1516 
     | 
    
         
             
                                    title="next chapter">7.9. Token filters</a></p>
         
     | 
| 
       1467 
     | 
    
         
            -
              < 
     | 
| 
       1468 
     | 
    
         
            -
             
     | 
| 
       1469 
     | 
    
         
            -
                < 
     | 
| 
       1470 
     | 
    
         
            -
             
     | 
| 
       1471 
     | 
    
         
            -
             
     | 
| 
       1472 
     | 
    
         
            -
             
     | 
| 
      
 1517 
     | 
    
         
            +
              <div role="note" aria-label="source link">
         
     | 
| 
      
 1518 
     | 
    
         
            +
                <h3>This Page</h3>
         
     | 
| 
      
 1519 
     | 
    
         
            +
                <ul class="this-page-menu">
         
     | 
| 
      
 1520 
     | 
    
         
            +
                  <li><a href="../_sources/reference/tokenizers.txt"
         
     | 
| 
      
 1521 
     | 
    
         
            +
                        rel="nofollow">Show Source</a></li>
         
     | 
| 
      
 1522 
     | 
    
         
            +
                </ul>
         
     | 
| 
      
 1523 
     | 
    
         
            +
               </div>
         
     | 
| 
      
 1524 
     | 
    
         
            +
            <div id="searchbox" style="display: none" role="search">
         
     | 
| 
       1473 
1525 
     | 
    
         
             
              <h3>Quick search</h3>
         
     | 
| 
       1474 
1526 
     | 
    
         
             
                <form class="search" action="../search.html" method="get">
         
     | 
| 
       1475 
1527 
     | 
    
         
             
                  <input type="text" name="q" />
         
     | 
| 
         @@ -1486,7 +1538,7 @@ text search.</p> 
     | 
|
| 
       1486 
1538 
     | 
    
         
             
                  </div>
         
     | 
| 
       1487 
1539 
     | 
    
         
             
                  <div class="clearer"></div>
         
     | 
| 
       1488 
1540 
     | 
    
         
             
                </div>
         
     | 
| 
       1489 
     | 
    
         
            -
                <div class="related">
         
     | 
| 
      
 1541 
     | 
    
         
            +
                <div class="related" role="navigation" aria-label="related navigation">
         
     | 
| 
       1490 
1542 
     | 
    
         
             
                  <h3>Navigation</h3>
         
     | 
| 
       1491 
1543 
     | 
    
         
             
                  <ul>
         
     | 
| 
       1492 
1544 
     | 
    
         
             
                    <li class="right" style="margin-right: 10px">
         
     | 
| 
         @@ -1498,11 +1550,11 @@ text search.</p> 
     | 
|
| 
       1498 
1550 
     | 
    
         
             
                    <li class="right" >
         
     | 
| 
       1499 
1551 
     | 
    
         
             
                      <a href="normalizers.html" title="7.7. Normalizers"
         
     | 
| 
       1500 
1552 
     | 
    
         
             
                         >previous</a> |</li>
         
     | 
| 
       1501 
     | 
    
         
            -
                    <li><a href="../index.html">Groonga v5.0. 
     | 
| 
       1502 
     | 
    
         
            -
                      <li><a href="../reference.html" >7. Reference manual</a> »</li> 
         
     | 
| 
      
 1553 
     | 
    
         
            +
                    <li class="nav-item nav-item-0"><a href="../index.html">Groonga v5.0.6-226-gd7da7e7 documentation</a> »</li>
         
     | 
| 
      
 1554 
     | 
    
         
            +
                      <li class="nav-item nav-item-1"><a href="../reference.html" >7. Reference manual</a> »</li> 
         
     | 
| 
       1503 
1555 
     | 
    
         
             
                  </ul>
         
     | 
| 
       1504 
1556 
     | 
    
         
             
                </div>
         
     | 
| 
       1505 
     | 
    
         
            -
                <div class="footer">
         
     | 
| 
      
 1557 
     | 
    
         
            +
                <div class="footer" role="contentinfo">
         
     | 
| 
       1506 
1558 
     | 
    
         
             
                    © Copyright 2009-2015, Brazil, Inc.
         
     | 
| 
       1507 
1559 
     | 
    
         
             
                </div>
         
     | 
| 
       1508 
1560 
     | 
    
         
             
              </body>
         
     |