ferret 0.9.6 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/MIT-LICENSE +1 -1
- data/README +12 -24
- data/Rakefile +38 -54
- data/TODO +14 -17
- data/ext/analysis.c +982 -823
- data/ext/analysis.h +133 -76
- data/ext/array.c +96 -58
- data/ext/array.h +40 -13
- data/ext/bitvector.c +476 -118
- data/ext/bitvector.h +264 -22
- data/ext/compound_io.c +217 -229
- data/ext/defines.h +49 -0
- data/ext/document.c +107 -317
- data/ext/document.h +31 -65
- data/ext/except.c +81 -36
- data/ext/except.h +117 -55
- data/ext/extconf.rb +2 -9
- data/ext/ferret.c +211 -104
- data/ext/ferret.h +22 -11
- data/ext/filter.c +97 -82
- data/ext/fs_store.c +348 -367
- data/ext/global.c +226 -188
- data/ext/global.h +44 -26
- data/ext/hash.c +474 -391
- data/ext/hash.h +441 -68
- data/ext/hashset.c +124 -96
- data/ext/hashset.h +169 -20
- data/ext/helper.c +56 -5
- data/ext/helper.h +7 -0
- data/ext/inc/lang.h +29 -49
- data/ext/inc/threading.h +31 -0
- data/ext/ind.c +288 -278
- data/ext/ind.h +68 -0
- data/ext/index.c +5688 -0
- data/ext/index.h +663 -616
- data/ext/lang.h +29 -49
- data/ext/libstemmer.c +3 -3
- data/ext/mem_pool.c +84 -0
- data/ext/mem_pool.h +35 -0
- data/ext/posh.c +1006 -0
- data/ext/posh.h +1007 -0
- data/ext/priorityqueue.c +117 -194
- data/ext/priorityqueue.h +135 -39
- data/ext/q_boolean.c +1305 -1108
- data/ext/q_const_score.c +106 -93
- data/ext/q_filtered_query.c +138 -135
- data/ext/q_fuzzy.c +206 -242
- data/ext/q_match_all.c +94 -80
- data/ext/q_multi_term.c +663 -0
- data/ext/q_parser.c +667 -593
- data/ext/q_phrase.c +992 -555
- data/ext/q_prefix.c +72 -61
- data/ext/q_range.c +235 -210
- data/ext/q_span.c +1480 -1166
- data/ext/q_term.c +273 -246
- data/ext/q_wildcard.c +127 -114
- data/ext/r_analysis.c +1720 -711
- data/ext/r_index.c +3049 -0
- data/ext/r_qparser.c +433 -146
- data/ext/r_search.c +2934 -1993
- data/ext/r_store.c +372 -143
- data/ext/r_utils.c +941 -0
- data/ext/ram_store.c +330 -326
- data/ext/search.c +1291 -668
- data/ext/search.h +403 -702
- data/ext/similarity.c +91 -113
- data/ext/similarity.h +45 -30
- data/ext/sort.c +721 -484
- data/ext/stopwords.c +361 -273
- data/ext/store.c +556 -58
- data/ext/store.h +706 -126
- data/ext/tags +3578 -2780
- data/ext/term_vectors.c +352 -0
- data/ext/threading.h +31 -0
- data/ext/win32.h +54 -0
- data/lib/ferret.rb +5 -17
- data/lib/ferret/document.rb +130 -2
- data/lib/ferret/index.rb +577 -26
- data/lib/ferret/number_tools.rb +157 -0
- data/lib/ferret_version.rb +3 -0
- data/test/test_helper.rb +5 -13
- data/test/unit/analysis/tc_analyzer.rb +513 -1
- data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
- data/test/unit/index/tc_index.rb +183 -240
- data/test/unit/index/tc_index_reader.rb +312 -479
- data/test/unit/index/tc_index_writer.rb +397 -13
- data/test/unit/index/th_doc.rb +269 -206
- data/test/unit/query_parser/tc_query_parser.rb +40 -33
- data/test/unit/search/tc_filter.rb +59 -71
- data/test/unit/search/tc_fuzzy_query.rb +24 -16
- data/test/unit/search/tc_index_searcher.rb +23 -201
- data/test/unit/search/tc_multi_searcher.rb +78 -226
- data/test/unit/search/tc_search_and_sort.rb +93 -81
- data/test/unit/search/tc_sort.rb +23 -23
- data/test/unit/search/tc_sort_field.rb +7 -7
- data/test/unit/search/tc_spans.rb +51 -47
- data/test/unit/search/tm_searcher.rb +339 -0
- data/test/unit/store/tc_fs_store.rb +1 -1
- data/test/unit/store/tm_store_lock.rb +3 -3
- data/test/unit/tc_document.rb +81 -0
- data/test/unit/ts_analysis.rb +1 -1
- data/test/unit/ts_utils.rb +1 -1
- data/test/unit/utils/tc_bit_vector.rb +288 -0
- data/test/unit/utils/tc_number_tools.rb +117 -0
- data/test/unit/utils/tc_priority_queue.rb +106 -0
- metadata +140 -301
- data/CHANGELOG +0 -9
- data/ext/dummy.exe +0 -0
- data/ext/field.c +0 -408
- data/ext/frtio.h +0 -13
- data/ext/inc/except.h +0 -90
- data/ext/index_io.c +0 -382
- data/ext/index_rw.c +0 -2658
- data/ext/lang.c +0 -41
- data/ext/nix_io.c +0 -134
- data/ext/q_multi_phrase.c +0 -380
- data/ext/r_doc.c +0 -582
- data/ext/r_index_io.c +0 -1021
- data/ext/r_term.c +0 -219
- data/ext/term.c +0 -820
- data/ext/termdocs.c +0 -611
- data/ext/vector.c +0 -637
- data/ext/w32_io.c +0 -150
- data/lib/ferret/analysis.rb +0 -11
- data/lib/ferret/analysis/analyzers.rb +0 -112
- data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
- data/lib/ferret/analysis/token.rb +0 -100
- data/lib/ferret/analysis/token_filters.rb +0 -86
- data/lib/ferret/analysis/token_stream.rb +0 -26
- data/lib/ferret/analysis/tokenizers.rb +0 -112
- data/lib/ferret/analysis/word_list_loader.rb +0 -27
- data/lib/ferret/document/document.rb +0 -152
- data/lib/ferret/document/field.rb +0 -312
- data/lib/ferret/index/compound_file_io.rb +0 -338
- data/lib/ferret/index/document_writer.rb +0 -289
- data/lib/ferret/index/field_infos.rb +0 -279
- data/lib/ferret/index/fields_io.rb +0 -181
- data/lib/ferret/index/index.rb +0 -675
- data/lib/ferret/index/index_file_names.rb +0 -33
- data/lib/ferret/index/index_reader.rb +0 -503
- data/lib/ferret/index/index_writer.rb +0 -534
- data/lib/ferret/index/multi_reader.rb +0 -377
- data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
- data/lib/ferret/index/segment_infos.rb +0 -130
- data/lib/ferret/index/segment_merge_info.rb +0 -49
- data/lib/ferret/index/segment_merge_queue.rb +0 -16
- data/lib/ferret/index/segment_merger.rb +0 -358
- data/lib/ferret/index/segment_reader.rb +0 -412
- data/lib/ferret/index/segment_term_enum.rb +0 -169
- data/lib/ferret/index/segment_term_vector.rb +0 -58
- data/lib/ferret/index/term.rb +0 -53
- data/lib/ferret/index/term_buffer.rb +0 -83
- data/lib/ferret/index/term_doc_enum.rb +0 -291
- data/lib/ferret/index/term_enum.rb +0 -52
- data/lib/ferret/index/term_info.rb +0 -37
- data/lib/ferret/index/term_infos_io.rb +0 -321
- data/lib/ferret/index/term_vector_offset_info.rb +0 -20
- data/lib/ferret/index/term_vectors_io.rb +0 -553
- data/lib/ferret/query_parser.rb +0 -312
- data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
- data/lib/ferret/search.rb +0 -50
- data/lib/ferret/search/boolean_clause.rb +0 -100
- data/lib/ferret/search/boolean_query.rb +0 -299
- data/lib/ferret/search/boolean_scorer.rb +0 -294
- data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
- data/lib/ferret/search/conjunction_scorer.rb +0 -99
- data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
- data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
- data/lib/ferret/search/explanation.rb +0 -41
- data/lib/ferret/search/field_cache.rb +0 -215
- data/lib/ferret/search/field_doc.rb +0 -31
- data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
- data/lib/ferret/search/filter.rb +0 -11
- data/lib/ferret/search/filtered_query.rb +0 -130
- data/lib/ferret/search/filtered_term_enum.rb +0 -79
- data/lib/ferret/search/fuzzy_query.rb +0 -154
- data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
- data/lib/ferret/search/hit_collector.rb +0 -34
- data/lib/ferret/search/hit_queue.rb +0 -11
- data/lib/ferret/search/index_searcher.rb +0 -200
- data/lib/ferret/search/match_all_query.rb +0 -104
- data/lib/ferret/search/multi_phrase_query.rb +0 -216
- data/lib/ferret/search/multi_searcher.rb +0 -261
- data/lib/ferret/search/multi_term_query.rb +0 -65
- data/lib/ferret/search/non_matching_scorer.rb +0 -22
- data/lib/ferret/search/phrase_positions.rb +0 -55
- data/lib/ferret/search/phrase_query.rb +0 -214
- data/lib/ferret/search/phrase_scorer.rb +0 -152
- data/lib/ferret/search/prefix_query.rb +0 -54
- data/lib/ferret/search/query.rb +0 -140
- data/lib/ferret/search/query_filter.rb +0 -51
- data/lib/ferret/search/range_filter.rb +0 -103
- data/lib/ferret/search/range_query.rb +0 -139
- data/lib/ferret/search/req_excl_scorer.rb +0 -125
- data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
- data/lib/ferret/search/score_doc.rb +0 -38
- data/lib/ferret/search/score_doc_comparator.rb +0 -114
- data/lib/ferret/search/scorer.rb +0 -91
- data/lib/ferret/search/similarity.rb +0 -278
- data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
- data/lib/ferret/search/sort.rb +0 -112
- data/lib/ferret/search/sort_comparator.rb +0 -60
- data/lib/ferret/search/sort_field.rb +0 -91
- data/lib/ferret/search/spans.rb +0 -12
- data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
- data/lib/ferret/search/spans/span_first_query.rb +0 -79
- data/lib/ferret/search/spans/span_near_query.rb +0 -108
- data/lib/ferret/search/spans/span_not_query.rb +0 -130
- data/lib/ferret/search/spans/span_or_query.rb +0 -176
- data/lib/ferret/search/spans/span_query.rb +0 -25
- data/lib/ferret/search/spans/span_scorer.rb +0 -74
- data/lib/ferret/search/spans/span_term_query.rb +0 -105
- data/lib/ferret/search/spans/span_weight.rb +0 -84
- data/lib/ferret/search/spans/spans_enum.rb +0 -44
- data/lib/ferret/search/term_query.rb +0 -128
- data/lib/ferret/search/term_scorer.rb +0 -183
- data/lib/ferret/search/top_docs.rb +0 -36
- data/lib/ferret/search/top_field_docs.rb +0 -17
- data/lib/ferret/search/weight.rb +0 -54
- data/lib/ferret/search/wildcard_query.rb +0 -26
- data/lib/ferret/search/wildcard_term_enum.rb +0 -61
- data/lib/ferret/stemmers.rb +0 -1
- data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
- data/lib/ferret/store.rb +0 -5
- data/lib/ferret/store/buffered_index_io.rb +0 -190
- data/lib/ferret/store/directory.rb +0 -141
- data/lib/ferret/store/fs_store.rb +0 -381
- data/lib/ferret/store/index_io.rb +0 -245
- data/lib/ferret/store/ram_store.rb +0 -286
- data/lib/ferret/utils.rb +0 -8
- data/lib/ferret/utils/bit_vector.rb +0 -123
- data/lib/ferret/utils/date_tools.rb +0 -138
- data/lib/ferret/utils/number_tools.rb +0 -91
- data/lib/ferret/utils/parameter.rb +0 -41
- data/lib/ferret/utils/priority_queue.rb +0 -120
- data/lib/ferret/utils/string_helper.rb +0 -47
- data/lib/ferret/utils/thread_local.rb +0 -28
- data/lib/ferret/utils/weak_key_hash.rb +0 -60
- data/lib/rferret.rb +0 -37
- data/rake_utils/code_statistics.rb +0 -106
- data/test/benchmark/tb_ram_store.rb +0 -76
- data/test/benchmark/tb_rw_vint.rb +0 -26
- data/test/functional/thread_safety_index_test.rb +0 -81
- data/test/functional/thread_safety_test.rb +0 -137
- data/test/longrunning/tc_numbertools.rb +0 -60
- data/test/longrunning/tm_store.rb +0 -19
- data/test/unit/analysis/ctc_analyzer.rb +0 -532
- data/test/unit/analysis/data/wordfile +0 -6
- data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
- data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
- data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
- data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_stop_filter.rb +0 -14
- data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
- data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
- data/test/unit/analysis/tc_token.rb +0 -25
- data/test/unit/document/rtc_field.rb +0 -28
- data/test/unit/document/tc_document.rb +0 -47
- data/test/unit/document/tc_field.rb +0 -98
- data/test/unit/index/rtc_compound_file_io.rb +0 -107
- data/test/unit/index/rtc_field_infos.rb +0 -127
- data/test/unit/index/rtc_fields_io.rb +0 -167
- data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
- data/test/unit/index/rtc_segment_infos.rb +0 -74
- data/test/unit/index/rtc_segment_term_docs.rb +0 -17
- data/test/unit/index/rtc_segment_term_enum.rb +0 -60
- data/test/unit/index/rtc_segment_term_vector.rb +0 -71
- data/test/unit/index/rtc_term_buffer.rb +0 -57
- data/test/unit/index/rtc_term_info.rb +0 -19
- data/test/unit/index/rtc_term_infos_io.rb +0 -192
- data/test/unit/index/rtc_term_vectors_io.rb +0 -108
- data/test/unit/index/tc_term.rb +0 -27
- data/test/unit/index/tc_term_voi.rb +0 -18
- data/test/unit/search/rtc_similarity.rb +0 -37
- data/test/unit/search/rtc_sort_field.rb +0 -14
- data/test/unit/search/tc_multi_searcher2.rb +0 -126
- data/test/unit/store/rtc_fs_store.rb +0 -62
- data/test/unit/store/rtc_ram_store.rb +0 -15
- data/test/unit/store/rtm_store.rb +0 -150
- data/test/unit/store/rtm_store_lock.rb +0 -2
- data/test/unit/ts_document.rb +0 -2
- data/test/unit/utils/rtc_bit_vector.rb +0 -73
- data/test/unit/utils/rtc_date_tools.rb +0 -50
- data/test/unit/utils/rtc_number_tools.rb +0 -59
- data/test/unit/utils/rtc_parameter.rb +0 -40
- data/test/unit/utils/rtc_priority_queue.rb +0 -62
- data/test/unit/utils/rtc_string_helper.rb +0 -21
- data/test/unit/utils/rtc_thread.rb +0 -61
- data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
- data/test/utils/number_to_spoken.rb +0 -132
data/MIT-LICENSE
CHANGED
data/README
CHANGED
@@ -8,7 +8,7 @@ search for things in them later.
|
|
8
8
|
== Requirements
|
9
9
|
|
10
10
|
* Ruby 1.8
|
11
|
-
*
|
11
|
+
* C compiler to build the extension. Tested with gcc, VC6 and VC2005
|
12
12
|
|
13
13
|
== Installation
|
14
14
|
|
@@ -26,7 +26,7 @@ Run the following;
|
|
26
26
|
$ rake ext
|
27
27
|
$ ruby setup.rb config
|
28
28
|
$ ruby setup.rb setup
|
29
|
-
#
|
29
|
+
# ruby setup.rb install
|
30
30
|
|
31
31
|
These simple steps install ferret in the default location of Ruby libraries.
|
32
32
|
You can also install files into your favorite directory by supplying setup.rb
|
@@ -53,10 +53,8 @@ documentation.
|
|
53
53
|
abilities of Ferret to present your data the best way you see fit.
|
54
54
|
|
55
55
|
* Ferret::Document: to find out how to create documents. This part of Ferret
|
56
|
-
is relatively straightforward.
|
57
|
-
|
58
|
-
positions and offsets of the data which can be very useful in document
|
59
|
-
comparison amoung other things. == More information
|
56
|
+
is relatively straightforward. If you know how Strings, Hashes and Arrays work
|
57
|
+
Ferret then you'll be able to create Documents.
|
60
58
|
|
61
59
|
* Ferret::QueryParser: if you want to find out more about what you can do with
|
62
60
|
Ferret's Query Parser, this is the place to look. The query parser is one
|
@@ -71,17 +69,8 @@ documentation.
|
|
71
69
|
|
72
70
|
=== Performance
|
73
71
|
|
74
|
-
|
75
|
-
|
76
|
-
not have installed when you installed Ferret. These double the speed but still
|
77
|
-
leave it a lot slower than the Java version. I have, however, ported the
|
78
|
-
indexing part of Java Lucene to C and it is an order of magnitude faster then
|
79
|
-
the Java version. Once I'm pretty certain that the API of Ferret has settled
|
80
|
-
and won't be changing much, I'll intergrate my C version. So expect to see
|
81
|
-
Ferret running faster than Java Lucene some time in the future. If you'd like
|
82
|
-
to try cferret and test my claims, let me know (if you haven't already found
|
83
|
-
it in my subversion repository). It's not currently portable and will probably
|
84
|
-
only run on linux.
|
72
|
+
We are unaware of any alternatives that can out-perform Ferret while still
|
73
|
+
matching it in features.
|
85
74
|
|
86
75
|
== Contact
|
87
76
|
|
@@ -89,17 +78,16 @@ For bug reports and patches I have set up Trac here;
|
|
89
78
|
|
90
79
|
http://ferret.davebalmain.com/trac
|
91
80
|
|
92
|
-
Queries, discussion etc should be addressed to the
|
93
|
-
at;
|
81
|
+
Queries, discussion etc should be addressed to the mailing lists here;
|
94
82
|
|
95
83
|
http://rubyforge.org/projects/ferret/
|
96
84
|
|
97
|
-
Alternatively you could create a new page for discussion on the wiki
|
98
|
-
page above. Or, if you're shy, please feel free to email me directly at dbalmain@gmail.com
|
85
|
+
Alternatively you could create a new page for discussion on the Ferret wiki;
|
99
86
|
|
100
|
-
|
101
|
-
|
102
|
-
|
87
|
+
http://ferret.davebalmain.com/trac
|
88
|
+
|
89
|
+
Of course, since Ferret was ported from Apache Lucene, most of what you can
|
90
|
+
do with Lucene you can also do with Ferret.
|
103
91
|
|
104
92
|
== Authors
|
105
93
|
|
data/Rakefile
CHANGED
@@ -8,8 +8,7 @@ require 'rake'
|
|
8
8
|
require 'rake/testtask'
|
9
9
|
require 'rake/rdoctask'
|
10
10
|
require 'rake/clean'
|
11
|
-
require '
|
12
|
-
require 'lib/rferret'
|
11
|
+
require 'ferret_version'
|
13
12
|
|
14
13
|
begin
|
15
14
|
require 'rubygems'
|
@@ -29,18 +28,18 @@ def announce(msg='')
|
|
29
28
|
STDERR.puts msg
|
30
29
|
end
|
31
30
|
|
32
|
-
$VERBOSE = nil
|
33
|
-
|
34
31
|
EXT = "ferret_ext.so"
|
35
|
-
EXT_SRC = FileList["src
|
36
|
-
|
37
|
-
|
38
|
-
|
32
|
+
EXT_SRC = FileList["../c/src/*.[c]", "../c/include/*.h",
|
33
|
+
"../c/lib/libstemmer_c/src_c/*.[ch]",
|
34
|
+
"../c/lib/libstemmer_c/runtime/*.[ch]",
|
35
|
+
"../c/lib/libstemmer_c/libstemmer/*.[ch]",
|
36
|
+
"../c/lib/libstemmer_c/include/libstemmer.h"]
|
39
37
|
|
40
38
|
EXT_SRC_DEST = EXT_SRC.map {|fn| File.join("ext", File.basename(fn))}
|
41
39
|
SRC = (FileList["ext/*.[ch]"] + EXT_SRC_DEST).uniq
|
42
40
|
|
43
|
-
CLEAN.include(FileList['**/*.o', '**/*.obj', 'InstalledFiles',
|
41
|
+
CLEAN.include(FileList['**/*.o', '**/*.obj', 'InstalledFiles',
|
42
|
+
'.config', 'ext/cferret.c'])
|
44
43
|
CLOBBER.include(FileList['**/*.so'], 'ext/Makefile', EXT_SRC_DEST)
|
45
44
|
POLISH = Rake::FileList.new.include(FileList['**/*.so'], 'ext/Makefile')
|
46
45
|
|
@@ -49,69 +48,53 @@ task :polish => [:clean] do
|
|
49
48
|
POLISH.each { |fn| rm_r fn rescue nil }
|
50
49
|
end
|
51
50
|
|
51
|
+
desc "Run tests with Valgrind"
|
52
|
+
task :valgrind do
|
53
|
+
sh "valgrind --gen-suppressions=yes --suppressions=ferret_valgrind.supp " +
|
54
|
+
"--leak-check=yes --show-reachable=yes -v ruby test/test_all.rb"
|
55
|
+
#sh "valgrind --suppressions=ferret_valgrind.supp " +
|
56
|
+
# "--leak-check=yes --show-reachable=yes -v ruby test/unit/index/tc_index_reader.rb"
|
57
|
+
end
|
58
|
+
|
52
59
|
task :default => :test_all
|
53
|
-
|
54
|
-
|
60
|
+
#task :default => :ext do
|
61
|
+
# sh "ruby test/unit/index/tc_index.rb"
|
62
|
+
#end
|
55
63
|
|
56
|
-
desc "
|
57
|
-
task :
|
64
|
+
desc "Run all tests"
|
65
|
+
task :test_all => [ :test_units ]
|
58
66
|
|
59
|
-
desc "
|
60
|
-
|
61
|
-
t.ruby_opts = ["-r 'lib/rferret'"]
|
62
|
-
t.libs << "test/unit"
|
63
|
-
t.pattern = 'test/unit/ts_*.rb'
|
64
|
-
t.verbose = true
|
65
|
-
end
|
66
|
-
|
67
|
-
desc "run unit tests in test/unit for C ferret"
|
68
|
-
Rake::TestTask.new("test_cunits" => :ext) do |t|
|
69
|
-
t.libs << "test/unit"
|
70
|
-
t.pattern = 'test/unit/ts_*.rb'
|
71
|
-
t.verbose = true
|
72
|
-
end
|
67
|
+
desc "Generate API documentation"
|
68
|
+
task :doc => [ :appdoc ]
|
73
69
|
|
74
70
|
desc "run unit tests in test/unit"
|
75
|
-
Rake::TestTask.new("
|
76
|
-
t.libs << "test"
|
71
|
+
Rake::TestTask.new("test_units" => :ext) do |t|
|
77
72
|
t.libs << "test/unit"
|
78
|
-
t.test_files = FileList["test/longrunning/tm_store.rb"]
|
79
73
|
t.pattern = 'test/unit/t[cs]_*.rb'
|
74
|
+
#t.pattern = 'test/unit/search/tc_index_searcher.rb'
|
80
75
|
t.verbose = true
|
81
76
|
end
|
82
77
|
|
83
|
-
desc "run funtional tests in test/funtional"
|
84
|
-
Rake::TestTask.new("test_functional") do |t|
|
85
|
-
t.libs << "test"
|
86
|
-
t.pattern = 'test/funtional/tc_*.rb'
|
87
|
-
t.verbose = true
|
88
|
-
end
|
89
|
-
|
90
|
-
desc "Report code statistics (KLOCS, etc) from application"
|
91
|
-
task :stats do
|
92
|
-
CodeStatistics.new(
|
93
|
-
["Ferret", "lib/ferret"],
|
94
|
-
["Units", "test/unit"],
|
95
|
-
["Units-extended", "test/longrunning"]
|
96
|
-
).to_s
|
97
|
-
end
|
98
|
-
|
99
78
|
desc "Generate documentation for the application"
|
100
79
|
rd = Rake::RDocTask.new("appdoc") do |rdoc|
|
101
80
|
rdoc.rdoc_dir = 'doc/api'
|
102
81
|
rdoc.title = "Ferret Search Library Documentation"
|
103
|
-
rdoc.options << '--line-numbers
|
82
|
+
rdoc.options << '--line-numbers'
|
83
|
+
rdoc.options << '--inline-source'
|
84
|
+
rdoc.options << '--charset=utf-8'
|
104
85
|
rdoc.rdoc_files.include('README')
|
105
86
|
rdoc.rdoc_files.include('TODO')
|
106
87
|
rdoc.rdoc_files.include('TUTORIAL')
|
107
88
|
rdoc.rdoc_files.include('MIT-LICENSE')
|
108
89
|
rdoc.rdoc_files.include('lib/**/*.rb')
|
90
|
+
rdoc.rdoc_files.include('ext/r_*.c')
|
91
|
+
rdoc.rdoc_files.include('ext/ferret.c')
|
109
92
|
end
|
110
93
|
|
111
94
|
EXT_SRC.each do |fn|
|
112
95
|
dest_fn = File.join("ext", File.basename(fn))
|
113
96
|
file dest_fn => fn do |t|
|
114
|
-
|
97
|
+
ln_s File.join("..", fn), dest_fn
|
115
98
|
if fn =~ /stemmer/
|
116
99
|
# flatten the directory structure for lib_stemmer
|
117
100
|
open(dest_fn) do |in_f|
|
@@ -129,7 +112,7 @@ task :ext => ["ext/#{EXT}"] + SRC
|
|
129
112
|
|
130
113
|
file "ext/#{EXT}" => ["ext/Makefile"] do
|
131
114
|
cp "ext/inc/lang.h", "ext/lang.h"
|
132
|
-
cp "ext/inc/
|
115
|
+
cp "ext/inc/threading.h", "ext/threading.h"
|
133
116
|
cd "ext"
|
134
117
|
if (/mswin/ =~ RUBY_PLATFORM) and ENV['make'].nil?
|
135
118
|
sh "nmake"
|
@@ -140,10 +123,12 @@ file "ext/#{EXT}" => ["ext/Makefile"] do
|
|
140
123
|
end
|
141
124
|
|
142
125
|
file "ext/lang.h" => ["ext/inc/lang.h"] do
|
126
|
+
rm_f "ext/lang.h"
|
143
127
|
cp "ext/inc/lang.h", "ext/lang.h"
|
144
128
|
end
|
145
|
-
file "ext/
|
146
|
-
|
129
|
+
file "ext/threading.h" => ["ext/inc/threading.h"] do
|
130
|
+
rm_f "ext/threading.h"
|
131
|
+
cp "ext/inc/threading.h", "ext/threading.h"
|
147
132
|
end
|
148
133
|
|
149
134
|
file "ext/Makefile" => SRC do
|
@@ -175,7 +160,6 @@ PKG_FILES = FileList[
|
|
175
160
|
'Rakefile'
|
176
161
|
]
|
177
162
|
PKG_FILES.exclude('**/*.o')
|
178
|
-
PKG_FILES.include('ext/termdocs.c')
|
179
163
|
PKG_FILES.exclude('**/Makefile')
|
180
164
|
PKG_FILES.exclude('ext/ferret_ext.so')
|
181
165
|
|
@@ -213,6 +197,7 @@ else
|
|
213
197
|
s.require_path = 'lib' # Use these for libraries.
|
214
198
|
s.autorequire = 'ferret'
|
215
199
|
|
200
|
+
|
216
201
|
#s.bindir = "bin" # Use these for applications.
|
217
202
|
#s.executables = ["rake"]
|
218
203
|
#s.default_executable = "rake"
|
@@ -319,11 +304,10 @@ task :update_version => [:prerelease] do
|
|
319
304
|
else
|
320
305
|
announce "Updating Ferret version to #{PKG_VERSION}"
|
321
306
|
reversion("lib/ferret.rb")
|
322
|
-
reversion("lib/rferret.rb")
|
323
307
|
if ENV['RELTEST']
|
324
308
|
announce "Release Task Testing, skipping commiting of new version"
|
325
309
|
else
|
326
|
-
sh %{svn ci -m "Updated to version #{PKG_VERSION}" lib/
|
310
|
+
sh %{svn ci -m "Updated to version #{PKG_VERSION}" lib/ferret.rb}
|
327
311
|
end
|
328
312
|
end
|
329
313
|
end
|
data/TODO
CHANGED
@@ -1,17 +1,14 @@
|
|
1
|
-
=
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
*
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
*
|
14
|
-
* Add
|
15
|
-
* Multi Field Query
|
16
|
-
* Test threading
|
17
|
-
* Compile a proper dummy executable
|
1
|
+
= TODO
|
2
|
+
|
3
|
+
* user defined sorting
|
4
|
+
* add field compression
|
5
|
+
* Fix highlighting to work for compressed fields
|
6
|
+
* Fix highlighting to work for external fields
|
7
|
+
* Add Ferret::Index::Index
|
8
|
+
|
9
|
+
= Done
|
10
|
+
* Add string Sort descripter
|
11
|
+
* fix memory bug
|
12
|
+
* add MultiReader interface
|
13
|
+
* add lexicographical sort (byte sort)
|
14
|
+
* Add highlighting
|
data/ext/analysis.c
CHANGED
@@ -1,90 +1,95 @@
|
|
1
1
|
#include "analysis.h"
|
2
2
|
#include "hash.h"
|
3
|
-
#include
|
3
|
+
#include <libstemmer.h>
|
4
4
|
#include <string.h>
|
5
5
|
#include <ctype.h>
|
6
6
|
#include <wctype.h>
|
7
7
|
#include <wchar.h>
|
8
8
|
|
9
|
-
|
10
9
|
/****************************************************************************
|
11
10
|
*
|
12
11
|
* Token
|
13
12
|
*
|
14
13
|
****************************************************************************/
|
15
14
|
|
16
|
-
Token *
|
15
|
+
inline Token *tk_set(Token *tk,
|
16
|
+
char *text, int tlen, int start, int end, int pos_inc)
|
17
17
|
{
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
18
|
+
if (tlen >= MAX_WORD_SIZE) {
|
19
|
+
tlen = MAX_WORD_SIZE - 1;
|
20
|
+
}
|
21
|
+
memcpy(tk->text, text, sizeof(char) * tlen);
|
22
|
+
tk->text[tlen] = '\0';
|
23
|
+
tk->len = tlen;
|
24
|
+
tk->start = start;
|
25
|
+
tk->end = end;
|
26
|
+
tk->pos_inc = pos_inc;
|
27
|
+
return tk;
|
24
28
|
}
|
25
29
|
|
26
|
-
inline Token *
|
27
|
-
|
28
|
-
int tlen,
|
29
|
-
int start,
|
30
|
-
int end,
|
31
|
-
int pos_inc)
|
30
|
+
inline Token *tk_set_ts(Token *tk,
|
31
|
+
char *start, char *end, char *text, int pos_inc)
|
32
32
|
{
|
33
|
-
|
34
|
-
|
35
|
-
}
|
36
|
-
memcpy(tk->text, text, sizeof(char) * tlen);
|
37
|
-
tk->text[tlen] = '\0';
|
38
|
-
tk->start = start;
|
39
|
-
tk->end = end;
|
40
|
-
tk->pos_inc = pos_inc;
|
41
|
-
return tk;
|
33
|
+
return tk_set(tk, start, (int)(end - start),
|
34
|
+
(int)(start - text), (int)(end - text), pos_inc);
|
42
35
|
}
|
43
36
|
|
44
|
-
inline Token *
|
45
|
-
|
46
|
-
char *end,
|
47
|
-
char *text,
|
48
|
-
int pos_inc)
|
37
|
+
inline Token *tk_set_no_len(Token *tk,
|
38
|
+
char *text, int start, int end, int pos_inc)
|
49
39
|
{
|
50
|
-
|
51
|
-
(int)(start - text), (int)(end - text), pos_inc);
|
40
|
+
return tk_set(tk, text, (int)strlen(text), start, end, pos_inc);
|
52
41
|
}
|
53
42
|
|
54
|
-
inline Token *
|
55
|
-
|
56
|
-
int start,
|
57
|
-
int end,
|
58
|
-
int pos_inc)
|
43
|
+
inline Token *w_tk_set(Token *tk, wchar_t *text, int start, int end,
|
44
|
+
int pos_inc)
|
59
45
|
{
|
60
|
-
|
46
|
+
int len = wcstombs(tk->text, text, MAX_WORD_SIZE - 1);
|
47
|
+
tk->text[len] = '\0';
|
48
|
+
tk->len = len;
|
49
|
+
tk->start = start;
|
50
|
+
tk->end = end;
|
51
|
+
tk->pos_inc = pos_inc;
|
52
|
+
return tk;
|
61
53
|
}
|
62
54
|
|
63
55
|
int tk_eq(Token *tk1, Token *tk2)
|
64
56
|
{
|
65
|
-
|
66
|
-
|
57
|
+
return (strcmp((char *)tk1->text, (char *)tk2->text) == 0 &&
|
58
|
+
tk1->start == tk2->start && tk1->end == tk2->end);
|
67
59
|
}
|
68
60
|
|
69
61
|
int tk_cmp(Token *tk1, Token *tk2)
|
70
62
|
{
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
if (tk1->end > tk2->end) {
|
78
|
-
cmp = 1;
|
79
|
-
} else if (tk1->end < tk2->end) {
|
80
|
-
cmp = -1;
|
81
|
-
} else {
|
82
|
-
cmp = strcmp((char *)tk1->text, (char *)tk2->text);
|
63
|
+
int cmp;
|
64
|
+
if (tk1->start > tk2->start) {
|
65
|
+
cmp = 1;
|
66
|
+
}
|
67
|
+
else if (tk1->start < tk2->start) {
|
68
|
+
cmp = -1;
|
83
69
|
}
|
84
|
-
|
85
|
-
|
70
|
+
else {
|
71
|
+
if (tk1->end > tk2->end) {
|
72
|
+
cmp = 1;
|
73
|
+
}
|
74
|
+
else if (tk1->end < tk2->end) {
|
75
|
+
cmp = -1;
|
76
|
+
}
|
77
|
+
else {
|
78
|
+
cmp = strcmp((char *)tk1->text, (char *)tk2->text);
|
79
|
+
}
|
80
|
+
}
|
81
|
+
return cmp;
|
82
|
+
}
|
83
|
+
|
84
|
+
void tk_destroy(void *p)
|
85
|
+
{
|
86
|
+
free(p);
|
86
87
|
}
|
87
88
|
|
89
|
+
Token *tk_new()
|
90
|
+
{
|
91
|
+
return ALLOC(Token);
|
92
|
+
}
|
88
93
|
|
89
94
|
/****************************************************************************
|
90
95
|
*
|
@@ -92,92 +97,94 @@ int tk_cmp(Token *tk1, Token *tk2)
|
|
92
97
|
*
|
93
98
|
****************************************************************************/
|
94
99
|
|
95
|
-
void ts_deref(
|
100
|
+
void ts_deref(TokenStream *ts)
|
96
101
|
{
|
97
|
-
|
98
|
-
|
102
|
+
if (--ts->ref_cnt <= 0) {
|
103
|
+
ts->destroy_i(ts);
|
104
|
+
}
|
99
105
|
}
|
100
106
|
|
101
|
-
|
107
|
+
static TokenStream *ts_reset(TokenStream *ts, char *text)
|
102
108
|
{
|
103
|
-
|
104
|
-
|
109
|
+
ts->t = ts->text = text;
|
110
|
+
return ts;
|
105
111
|
}
|
106
112
|
|
107
|
-
|
113
|
+
TokenStream *ts_clone_size(TokenStream *orig_ts, size_t size)
|
108
114
|
{
|
109
|
-
|
115
|
+
TokenStream *ts = (TokenStream *)ecalloc(size);
|
116
|
+
memcpy(ts, orig_ts, size);
|
117
|
+
ts->ref_cnt = 1;
|
118
|
+
return ts;
|
110
119
|
}
|
111
120
|
|
112
|
-
TokenStream *
|
121
|
+
TokenStream *ts_new_i(size_t size)
|
113
122
|
{
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
123
|
+
TokenStream *ts = ecalloc(size);
|
124
|
+
|
125
|
+
ts->destroy_i = (void (*)(TokenStream *))&free;
|
126
|
+
ts->reset = &ts_reset;
|
127
|
+
ts->ref_cnt = 1;
|
128
|
+
|
129
|
+
return ts;
|
120
130
|
}
|
121
131
|
|
122
|
-
|
132
|
+
/****************************************************************************
|
133
|
+
* CachedTokenStream
|
134
|
+
****************************************************************************/
|
135
|
+
|
136
|
+
#define CTS(token_stream) ((CachedTokenStream *)(token_stream))
|
137
|
+
|
138
|
+
static TokenStream *cts_clone_i(TokenStream *orig_ts)
|
123
139
|
{
|
124
|
-
|
125
|
-
memcpy(ts, orig_ts, sizeof(TokenStream));
|
126
|
-
if (orig_ts->token) {
|
127
|
-
ts->token = ALLOC(Token);
|
128
|
-
memcpy(ts->token, orig_ts->token, sizeof(Token));
|
129
|
-
}
|
130
|
-
if (orig_ts->sub_ts) ts->sub_ts = ts_clone(orig_ts->sub_ts);
|
131
|
-
if (orig_ts->clone_i) orig_ts->clone_i(orig_ts, ts);
|
132
|
-
ts->ref_cnt = 1;
|
133
|
-
return ts;
|
140
|
+
return ts_clone_size(orig_ts, sizeof(CachedTokenStream));
|
134
141
|
}
|
135
142
|
|
136
|
-
|
137
|
-
static char * const ENC_ERR_MSG = "Error decoding input string. "
|
138
|
-
"Check that you have the locale set correctly";
|
139
|
-
#define MB_NEXT_CHAR \
|
140
|
-
if ((i = (int)mbrtowc(&wchr, t, MB_CUR_MAX, (mbstate_t *)ts->data)) < 0)\
|
141
|
-
RAISE(IO_ERROR, ENC_ERR_MSG)
|
142
|
-
|
143
|
-
inline Token *w_tk_set(Token *tk, wchar_t *text, int start, int end, int pos_inc)
|
143
|
+
static TokenStream *cts_new()
|
144
144
|
{
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
tk->pos_inc = pos_inc;
|
149
|
-
return tk;
|
145
|
+
TokenStream *ts = ts_new(CachedTokenStream);
|
146
|
+
ts->clone_i = &cts_clone_i;
|
147
|
+
return ts;
|
150
148
|
}
|
151
149
|
|
152
|
-
|
150
|
+
/* * Multi-byte TokenStream * */
|
151
|
+
|
152
|
+
#define MBTS(token_stream) ((MultiByteTokenStream *)(token_stream))
|
153
|
+
|
154
|
+
inline int mb_next_char(wchar_t *wchr, const char *s, mbstate_t *state)
|
153
155
|
{
|
154
|
-
|
155
|
-
|
156
|
-
|
156
|
+
int num_bytes;
|
157
|
+
if ((num_bytes = (int)mbrtowc(wchr, s, MB_CUR_MAX, state)) < 0) {
|
158
|
+
const char *t = s;
|
159
|
+
do {
|
160
|
+
t++;
|
161
|
+
ZEROSET(state, mbstate_t);
|
162
|
+
num_bytes = (int)mbrtowc(wchr, t, MB_CUR_MAX, state);
|
163
|
+
} while ((num_bytes < 0) && (*wchr != 0) && (*t != 0));
|
164
|
+
num_bytes += t - s;
|
165
|
+
}
|
166
|
+
return num_bytes;
|
157
167
|
}
|
158
168
|
|
159
|
-
|
169
|
+
static TokenStream *mb_ts_reset(TokenStream *ts, char *text)
|
160
170
|
{
|
161
|
-
|
162
|
-
|
171
|
+
ZEROSET(&(MBTS(ts)->state), mbstate_t);
|
172
|
+
ts_reset(ts, text);
|
173
|
+
return ts;
|
163
174
|
}
|
164
175
|
|
165
|
-
|
176
|
+
static TokenStream *mb_ts_clone_i(TokenStream *orig_ts)
|
166
177
|
{
|
167
|
-
|
168
|
-
memcpy(new_ts->data, orig_ts->data, sizeof(mbstate_t));
|
178
|
+
return ts_clone_size(orig_ts, sizeof(MultiByteTokenStream));
|
169
179
|
}
|
170
180
|
|
171
|
-
TokenStream *
|
181
|
+
TokenStream *mb_ts_new()
|
172
182
|
{
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
ts->clone_i = &mb_ts_clone_i;
|
179
|
-
ts->ref_cnt = 1;
|
180
|
-
return ts;
|
183
|
+
TokenStream *ts = ts_new(MultiByteTokenStream);
|
184
|
+
ts->reset = &mb_ts_reset;
|
185
|
+
ts->clone_i = &mb_ts_clone_i;
|
186
|
+
ts->ref_cnt = 1;
|
187
|
+
return ts;
|
181
188
|
}
|
182
189
|
|
183
190
|
/****************************************************************************
|
@@ -186,35 +193,40 @@ TokenStream *mb_ts_create()
|
|
186
193
|
*
|
187
194
|
****************************************************************************/
|
188
195
|
|
189
|
-
void a_deref(
|
196
|
+
void a_deref(Analyzer *a)
|
190
197
|
{
|
191
|
-
|
192
|
-
|
198
|
+
if (--a->ref_cnt <= 0) {
|
199
|
+
a->destroy_i(a);
|
200
|
+
}
|
193
201
|
}
|
194
202
|
|
195
|
-
void
|
203
|
+
static void a_standard_destroy_i(Analyzer *a)
|
196
204
|
{
|
197
|
-
|
198
|
-
|
205
|
+
if (a->current_ts) {
|
206
|
+
ts_deref(a->current_ts);
|
207
|
+
}
|
208
|
+
free(a);
|
199
209
|
}
|
200
210
|
|
201
|
-
TokenStream *a_standard_get_ts(Analyzer *a, char *field, char *text)
|
211
|
+
static TokenStream *a_standard_get_ts(Analyzer *a, char *field, char *text)
|
202
212
|
{
|
203
|
-
|
204
|
-
|
213
|
+
TokenStream *ts;
|
214
|
+
(void)field;
|
215
|
+
ts = ts_clone(a->current_ts);
|
216
|
+
return ts->reset(ts, text);
|
205
217
|
}
|
206
218
|
|
207
|
-
Analyzer *
|
208
|
-
|
209
|
-
|
219
|
+
Analyzer *analyzer_new(TokenStream *ts,
|
220
|
+
void (*destroy_i)(Analyzer *a),
|
221
|
+
TokenStream *(*get_ts)(Analyzer *a, char *field,
|
222
|
+
char *text))
|
210
223
|
{
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
return a;
|
224
|
+
Analyzer *a = ALLOC(Analyzer);
|
225
|
+
a->current_ts = ts;
|
226
|
+
a->destroy_i = (destroy_i ? destroy_i : &a_standard_destroy_i);
|
227
|
+
a->get_ts = (get_ts ? get_ts : &a_standard_get_ts);
|
228
|
+
a->ref_cnt = 1;
|
229
|
+
return a;
|
218
230
|
}
|
219
231
|
|
220
232
|
/****************************************************************************
|
@@ -226,120 +238,132 @@ Analyzer *analyzer_create(void *data, TokenStream *ts,
|
|
226
238
|
/*
|
227
239
|
* WhitespaceTokenizer
|
228
240
|
*/
|
229
|
-
Token *wst_next(TokenStream *ts)
|
241
|
+
static Token *wst_next(TokenStream *ts)
|
230
242
|
{
|
231
|
-
|
232
|
-
|
243
|
+
char *t = ts->t;
|
244
|
+
char *start;
|
233
245
|
|
234
|
-
|
246
|
+
while (*t != '\0' && isspace(*t)) {
|
247
|
+
t++;
|
248
|
+
}
|
235
249
|
|
236
|
-
|
250
|
+
if (*t == '\0') {
|
251
|
+
return NULL;
|
252
|
+
}
|
237
253
|
|
238
|
-
|
239
|
-
|
254
|
+
start = t;
|
255
|
+
while (*t != '\0' && !isspace(*t)) {
|
256
|
+
t++;
|
257
|
+
}
|
240
258
|
|
241
|
-
|
242
|
-
|
243
|
-
return ts->token;
|
259
|
+
ts->t = t;
|
260
|
+
return tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
|
244
261
|
}
|
245
262
|
|
246
|
-
TokenStream *
|
263
|
+
TokenStream *whitespace_tokenizer_new()
|
247
264
|
{
|
248
|
-
|
249
|
-
|
250
|
-
|
265
|
+
TokenStream *ts = cts_new();
|
266
|
+
ts->next = &wst_next;
|
267
|
+
return ts;
|
251
268
|
}
|
252
269
|
|
253
270
|
/*
|
254
271
|
* Multi-byte WhitespaceTokenizer
|
255
272
|
*/
|
256
|
-
Token *mb_wst_next(TokenStream *ts)
|
257
|
-
{
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
273
|
+
static Token *mb_wst_next(TokenStream *ts)
|
274
|
+
{
|
275
|
+
int i;
|
276
|
+
char *start;
|
277
|
+
char *t = ts->t;
|
278
|
+
wchar_t wchr;
|
279
|
+
mbstate_t *state = &(MBTS(ts)->state);
|
280
|
+
|
281
|
+
i = mb_next_char(&wchr, t, state);
|
282
|
+
while (wchr != 0 && iswspace(wchr)) {
|
283
|
+
t += i;
|
284
|
+
i = mb_next_char(&wchr, t, state);
|
285
|
+
}
|
286
|
+
if (wchr == 0) {
|
287
|
+
return NULL;
|
288
|
+
}
|
262
289
|
|
263
|
-
|
264
|
-
while (wchr != 0 && iswspace(wchr)) {
|
290
|
+
start = t;
|
265
291
|
t += i;
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
while (wchr != 0 && !iswspace(wchr)) {
|
274
|
-
t += i;
|
275
|
-
MB_NEXT_CHAR;
|
276
|
-
}
|
277
|
-
tk_set_ts(ts->token, start, t, ts->text, 1);
|
278
|
-
ts->t = t;
|
279
|
-
return ts->token;
|
292
|
+
i = mb_next_char(&wchr, t, state);
|
293
|
+
while (wchr != 0 && !iswspace(wchr)) {
|
294
|
+
t += i;
|
295
|
+
i = mb_next_char(&wchr, t, state);
|
296
|
+
}
|
297
|
+
ts->t = t;
|
298
|
+
return tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
|
280
299
|
}
|
281
300
|
|
282
301
|
/*
|
283
302
|
* Lowercasing Multi-byte WhitespaceTokenizer
|
284
303
|
*/
|
285
|
-
Token *mb_wst_next_lc(TokenStream *ts)
|
286
|
-
{
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
304
|
+
static Token *mb_wst_next_lc(TokenStream *ts)
|
305
|
+
{
|
306
|
+
int i;
|
307
|
+
char *start;
|
308
|
+
char *t = ts->t;
|
309
|
+
wchar_t wchr;
|
310
|
+
wchar_t wbuf[MAX_WORD_SIZE + 1], *w, *w_end;
|
311
|
+
mbstate_t *state = &(MBTS(ts)->state);
|
312
|
+
|
313
|
+
w = wbuf;
|
314
|
+
w_end = &wbuf[MAX_WORD_SIZE];
|
315
|
+
|
316
|
+
i = mb_next_char(&wchr, t, state);
|
317
|
+
while (wchr != 0 && iswspace(wchr)) {
|
318
|
+
t += i;
|
319
|
+
i = mb_next_char(&wchr, t, state);
|
320
|
+
}
|
321
|
+
if (wchr == 0) {
|
322
|
+
return NULL;
|
323
|
+
}
|
295
324
|
|
296
|
-
|
297
|
-
while (wchr != 0 && iswspace(wchr)) {
|
325
|
+
start = t;
|
298
326
|
t += i;
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
t
|
310
|
-
|
311
|
-
|
312
|
-
*w = 0;
|
313
|
-
w_tk_set(ts->token, wbuf, (int)(start - ts->text), (int)(t - ts->text), 1);
|
314
|
-
ts->t = t;
|
315
|
-
return ts->token;
|
327
|
+
*w++ = towlower(wchr);
|
328
|
+
i = mb_next_char(&wchr, t, state);
|
329
|
+
while (wchr != 0 && !iswspace(wchr)) {
|
330
|
+
if (w < w_end) {
|
331
|
+
*w++ = towlower(wchr);
|
332
|
+
}
|
333
|
+
t += i;
|
334
|
+
i = mb_next_char(&wchr, t, state);
|
335
|
+
}
|
336
|
+
*w = 0;
|
337
|
+
ts->t = t;
|
338
|
+
return w_tk_set(&(CTS(ts)->token), wbuf, (int)(start - ts->text),
|
339
|
+
(int)(t - ts->text), 1);
|
316
340
|
}
|
317
341
|
|
318
|
-
TokenStream *
|
342
|
+
TokenStream *mb_whitespace_tokenizer_new(bool lowercase)
|
319
343
|
{
|
320
|
-
|
321
|
-
|
322
|
-
|
344
|
+
TokenStream *ts = mb_ts_new();
|
345
|
+
ts->next = lowercase ? &mb_wst_next_lc : &mb_wst_next;
|
346
|
+
return ts;
|
323
347
|
}
|
324
348
|
|
325
349
|
/*
|
326
350
|
* WhitespaceAnalyzers
|
327
351
|
*/
|
328
|
-
Analyzer *
|
352
|
+
Analyzer *whitespace_analyzer_new(bool lowercase)
|
329
353
|
{
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
354
|
+
TokenStream *ts;
|
355
|
+
if (lowercase) {
|
356
|
+
ts = lowercase_filter_new(whitespace_tokenizer_new());
|
357
|
+
}
|
358
|
+
else {
|
359
|
+
ts = whitespace_tokenizer_new();
|
360
|
+
}
|
361
|
+
return analyzer_new(ts, NULL, NULL);
|
337
362
|
}
|
338
363
|
|
339
|
-
Analyzer *
|
364
|
+
Analyzer *mb_whitespace_analyzer_new(bool lowercase)
|
340
365
|
{
|
341
|
-
|
342
|
-
NULL, NULL);
|
366
|
+
return analyzer_new(mb_whitespace_tokenizer_new(lowercase), NULL, NULL);
|
343
367
|
}
|
344
368
|
|
345
369
|
/****************************************************************************
|
@@ -353,26 +377,31 @@ Analyzer *mb_whitespace_analyzer_create(bool lowercase)
|
|
353
377
|
*/
|
354
378
|
Token *lt_next(TokenStream *ts)
|
355
379
|
{
|
356
|
-
|
357
|
-
|
380
|
+
char *start;
|
381
|
+
char *t = ts->t;
|
358
382
|
|
359
|
-
|
383
|
+
while (*t != '\0' && !isalpha(*t)) {
|
384
|
+
t++;
|
385
|
+
}
|
360
386
|
|
361
|
-
|
387
|
+
if (*t == '\0') {
|
388
|
+
return NULL;
|
389
|
+
}
|
362
390
|
|
363
|
-
|
364
|
-
|
391
|
+
start = t;
|
392
|
+
while (*t != '\0' && isalpha(*t)) {
|
393
|
+
t++;
|
394
|
+
}
|
365
395
|
|
366
|
-
|
367
|
-
|
368
|
-
return ts->token;
|
396
|
+
ts->t = t;
|
397
|
+
return tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
|
369
398
|
}
|
370
399
|
|
371
|
-
TokenStream *
|
400
|
+
TokenStream *letter_tokenizer_new()
|
372
401
|
{
|
373
|
-
|
374
|
-
|
375
|
-
|
402
|
+
TokenStream *ts = cts_new();
|
403
|
+
ts->next = &lt_next;
|
404
|
+
return ts;
|
376
405
|
}
|
377
406
|
|
378
407
|
/*
|
@@ -380,28 +409,31 @@ TokenStream *letter_tokenizer_create()
|
|
380
409
|
*/
|
381
410
|
Token *mb_lt_next(TokenStream *ts)
|
382
411
|
{
|
383
|
-
|
384
|
-
|
385
|
-
|
386
|
-
|
412
|
+
int i;
|
413
|
+
char *start;
|
414
|
+
char *t = ts->t;
|
415
|
+
wchar_t wchr;
|
416
|
+
mbstate_t *state = &(MBTS(ts)->state);
|
387
417
|
|
388
|
-
|
389
|
-
|
390
|
-
|
391
|
-
|
392
|
-
|
393
|
-
|
394
|
-
|
395
|
-
|
396
|
-
|
397
|
-
|
398
|
-
|
418
|
+
i = mb_next_char(&wchr, t, state);
|
419
|
+
while (wchr != 0 && !iswalpha(wchr)) {
|
420
|
+
t += i;
|
421
|
+
i = mb_next_char(&wchr, t, state);
|
422
|
+
}
|
423
|
+
|
424
|
+
if (wchr == 0) {
|
425
|
+
return NULL;
|
426
|
+
}
|
427
|
+
|
428
|
+
start = t;
|
399
429
|
t += i;
|
400
|
-
|
401
|
-
|
402
|
-
|
403
|
-
|
404
|
-
|
430
|
+
i = mb_next_char(&wchr, t, state);
|
431
|
+
while (wchr != 0 && iswalpha(wchr)) {
|
432
|
+
t += i;
|
433
|
+
i = mb_next_char(&wchr, t, state);
|
434
|
+
}
|
435
|
+
ts->t = t;
|
436
|
+
return tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
|
405
437
|
}
|
406
438
|
|
407
439
|
/*
|
@@ -409,62 +441,67 @@ Token *mb_lt_next(TokenStream *ts)
|
|
409
441
|
*/
|
410
442
|
Token *mb_lt_next_lc(TokenStream *ts)
|
411
443
|
{
|
412
|
-
|
413
|
-
|
414
|
-
|
415
|
-
|
416
|
-
|
444
|
+
int i;
|
445
|
+
char *start;
|
446
|
+
char *t = ts->t;
|
447
|
+
wchar_t wchr;
|
448
|
+
wchar_t wbuf[MAX_WORD_SIZE + 1], *w, *w_end;
|
449
|
+
mbstate_t *state = &(MBTS(ts)->state);
|
417
450
|
|
418
|
-
|
419
|
-
|
451
|
+
w = wbuf;
|
452
|
+
w_end = &wbuf[MAX_WORD_SIZE];
|
420
453
|
|
421
|
-
|
422
|
-
|
423
|
-
|
424
|
-
|
425
|
-
|
426
|
-
|
427
|
-
|
428
|
-
|
429
|
-
|
430
|
-
|
431
|
-
MB_NEXT_CHAR;
|
432
|
-
while (wchr != 0 && iswalpha(wchr)) {
|
433
|
-
if (w < w_end) *w++ = towlower(wchr);
|
454
|
+
i = mb_next_char(&wchr, t, state);
|
455
|
+
while (wchr != 0 && !iswalpha(wchr)) {
|
456
|
+
t += i;
|
457
|
+
i = mb_next_char(&wchr, t, state);
|
458
|
+
}
|
459
|
+
if (wchr == 0) {
|
460
|
+
return NULL;
|
461
|
+
}
|
462
|
+
|
463
|
+
start = t;
|
434
464
|
t += i;
|
435
|
-
|
436
|
-
|
437
|
-
|
438
|
-
|
439
|
-
|
440
|
-
|
465
|
+
*w++ = towlower(wchr);
|
466
|
+
i = mb_next_char(&wchr, t, state);
|
467
|
+
while (wchr != 0 && iswalpha(wchr)) {
|
468
|
+
if (w < w_end) {
|
469
|
+
*w++ = towlower(wchr);
|
470
|
+
}
|
471
|
+
t += i;
|
472
|
+
i = mb_next_char(&wchr, t, state);
|
473
|
+
}
|
474
|
+
*w = 0;
|
475
|
+
ts->t = t;
|
476
|
+
return w_tk_set(&(CTS(ts)->token), wbuf, (int)(start - ts->text),
|
477
|
+
(int)(t - ts->text), 1);
|
441
478
|
}
|
442
479
|
|
443
|
-
TokenStream *
|
480
|
+
TokenStream *mb_letter_tokenizer_new(bool lowercase)
|
444
481
|
{
|
445
|
-
|
446
|
-
|
447
|
-
|
482
|
+
TokenStream *ts = mb_ts_new();
|
483
|
+
ts->next = lowercase ? &mb_lt_next_lc : &mb_lt_next;
|
484
|
+
return ts;
|
448
485
|
}
|
449
486
|
|
450
487
|
/*
|
451
488
|
* LetterAnalyzers
|
452
489
|
*/
|
453
|
-
Analyzer *
|
490
|
+
Analyzer *letter_analyzer_new(bool lowercase)
|
454
491
|
{
|
455
|
-
|
456
|
-
|
457
|
-
|
458
|
-
|
459
|
-
|
460
|
-
|
461
|
-
|
492
|
+
TokenStream *ts;
|
493
|
+
if (lowercase) {
|
494
|
+
ts = lowercase_filter_new(letter_tokenizer_new());
|
495
|
+
}
|
496
|
+
else {
|
497
|
+
ts = letter_tokenizer_new();
|
498
|
+
}
|
499
|
+
return analyzer_new(ts, NULL, NULL);
|
462
500
|
}
|
463
501
|
|
464
|
-
Analyzer *
|
502
|
+
Analyzer *mb_letter_analyzer_new(bool lowercase)
|
465
503
|
{
|
466
|
-
|
467
|
-
mb_letter_tokenizer_create(lowercase), NULL, NULL);
|
504
|
+
return analyzer_new(mb_letter_tokenizer_new(lowercase), NULL, NULL);
|
468
505
|
}
|
469
506
|
|
470
507
|
/****************************************************************************
|
@@ -473,115 +510,146 @@ Analyzer *mb_letter_analyzer_create(bool lowercase)
|
|
473
510
|
*
|
474
511
|
****************************************************************************/
|
475
512
|
|
513
|
+
#define STDTS(token_stream) ((StandardTokenizer *)(token_stream))
|
514
|
+
|
476
515
|
/*
|
477
516
|
* StandardTokenizer
|
478
517
|
*/
|
479
|
-
int std_get_alpha(TokenStream *ts, char *token)
|
518
|
+
static int std_get_alpha(TokenStream *ts, char *token)
|
480
519
|
{
|
481
|
-
|
482
|
-
|
483
|
-
|
484
|
-
|
485
|
-
|
486
|
-
|
487
|
-
|
520
|
+
int i = 0;
|
521
|
+
char *t = ts->t;
|
522
|
+
while (t[i] != '\0' && isalpha(t[i])) {
|
523
|
+
if (i < MAX_WORD_SIZE) {
|
524
|
+
token[i] = t[i];
|
525
|
+
}
|
526
|
+
i++;
|
527
|
+
}
|
528
|
+
return i;
|
488
529
|
}
|
489
530
|
|
490
|
-
int mb_std_get_alpha(TokenStream *ts, char *token)
|
531
|
+
static int mb_std_get_alpha(TokenStream *ts, char *token)
|
491
532
|
{
|
492
|
-
|
493
|
-
|
494
|
-
|
495
|
-
|
496
|
-
|
497
|
-
t
|
498
|
-
if ((i = mbtowc(&w, t, MB_CUR_MAX)) < 0) RAISE(IO_ERROR, ENC_ERR_MSG);
|
499
|
-
}
|
533
|
+
char *t = ts->t;
|
534
|
+
wchar_t wchr;
|
535
|
+
int i;
|
536
|
+
mbstate_t state; ZEROSET(&state, mbstate_t);
|
537
|
+
|
538
|
+
i = mb_next_char(&wchr, t, &state);
|
500
539
|
|
501
|
-
|
502
|
-
|
503
|
-
|
504
|
-
|
540
|
+
while (wchr != 0 && iswalpha(wchr)) {
|
541
|
+
t += i;
|
542
|
+
i = mb_next_char(&wchr, t, &state);
|
543
|
+
}
|
544
|
+
|
545
|
+
i = (int)(t - ts->t);
|
546
|
+
if (i > MAX_WORD_SIZE) {
|
547
|
+
i = MAX_WORD_SIZE - 1;
|
548
|
+
}
|
549
|
+
memcpy(token, ts->t, i);
|
550
|
+
return i;
|
505
551
|
}
|
506
552
|
|
507
|
-
|
553
|
+
/*
|
554
|
+
static int std_get_alnum(TokenStream *ts, char *token)
|
508
555
|
{
|
509
|
-
|
510
|
-
|
511
|
-
|
512
|
-
|
513
|
-
|
514
|
-
|
515
|
-
|
556
|
+
int i = 0;
|
557
|
+
char *t = ts->t;
|
558
|
+
while (t[i] != '\0' && isalnum(t[i])) {
|
559
|
+
if (i < MAX_WORD_SIZE) {
|
560
|
+
token[i] = t[i];
|
561
|
+
}
|
562
|
+
i++;
|
563
|
+
}
|
564
|
+
return i;
|
516
565
|
}
|
517
566
|
|
518
|
-
int mb_std_get_alnum(
|
567
|
+
static int mb_std_get_alnum(TokenStream *ts, char *token)
|
519
568
|
{
|
520
|
-
|
521
|
-
|
522
|
-
|
523
|
-
|
524
|
-
while (w != 0 && iswalnum(w)) {
|
525
|
-
t += i;
|
526
|
-
if ((i = mbtowc(&w, t, MB_CUR_MAX)) < 0) RAISE(IO_ERROR, ENC_ERR_MSG);
|
527
|
-
}
|
569
|
+
char *t = ts->t;
|
570
|
+
wchar_t wchr;
|
571
|
+
int i;
|
572
|
+
mbstate_t state; ZEROSET(&state, mbstate_t);
|
528
573
|
|
529
|
-
|
530
|
-
if (i > MAX_WORD_SIZE) i = MAX_WORD_SIZE - 1;
|
531
|
-
memcpy(token, ts->t, i);
|
532
|
-
return i;
|
574
|
+
i = mb_next_char(&wchr, t, &state);
|
533
575
|
|
576
|
+
while (wchr != 0 && iswalnum(wchr)) {
|
577
|
+
t += i;
|
578
|
+
i = mb_next_char(&wchr, t, &state);
|
579
|
+
}
|
580
|
+
|
581
|
+
i = (int)(t - ts->t);
|
582
|
+
if (i > MAX_WORD_SIZE) {
|
583
|
+
i = MAX_WORD_SIZE - 1;
|
584
|
+
}
|
585
|
+
memcpy(token, ts->t, i);
|
586
|
+
return i;
|
534
587
|
}
|
588
|
+
*/
|
535
589
|
|
536
|
-
int isnumpunc(char c)
|
590
|
+
static int isnumpunc(char c)
|
537
591
|
{
|
538
|
-
|
592
|
+
return (c == '.' || c == ',' || c == '\\' || c == '/' || c == '_'
|
593
|
+
|| c == '-');
|
539
594
|
}
|
540
595
|
|
541
|
-
int w_isnumpunc(wchar_t c)
|
596
|
+
static int w_isnumpunc(wchar_t c)
|
542
597
|
{
|
543
|
-
|
598
|
+
return (c == L'.' || c == L',' || c == L'\\' || c == L'/' || c == L'_'
|
599
|
+
|| c == L'-');
|
544
600
|
}
|
545
601
|
|
546
|
-
int isurlpunc(char c)
|
602
|
+
static int isurlpunc(char c)
|
547
603
|
{
|
548
|
-
|
604
|
+
return (c == '.' || c == '/' || c == '-' || c == '_');
|
549
605
|
}
|
550
606
|
|
551
|
-
int isurlc(char c)
|
607
|
+
static int isurlc(char c)
|
552
608
|
{
|
553
|
-
|
609
|
+
return (c == '.' || c == '/' || c == '-' || c == '_' || isalnum(c));
|
554
610
|
}
|
555
611
|
|
556
|
-
int isurlxatpunc(char c)
|
612
|
+
static int isurlxatpunc(char c)
|
557
613
|
{
|
558
|
-
|
614
|
+
return (c == '.' || c == '/' || c == '-' || c == '_' || c == '@');
|
559
615
|
}
|
560
616
|
|
561
|
-
int isurlxatc(char c)
|
617
|
+
static int isurlxatc(char c)
|
562
618
|
{
|
563
|
-
|
619
|
+
return (c == '.' || c == '/' || c == '-' || c == '_' || c == '@'
|
620
|
+
|| isalnum(c));
|
564
621
|
}
|
565
622
|
|
566
|
-
bool std_is_tok_char(char *c)
|
623
|
+
static bool std_is_tok_char(char *c)
|
567
624
|
{
|
568
|
-
|
569
|
-
|
570
|
-
|
571
|
-
|
572
|
-
|
625
|
+
if (isspace(*c)) {
|
626
|
+
return false; /* most common so check first. */
|
627
|
+
}
|
628
|
+
if (isalnum(*c) || isnumpunc(*c) || *c == '&' ||
|
629
|
+
*c == '@' || *c == '\'' || *c == ':') {
|
630
|
+
return true;
|
631
|
+
}
|
632
|
+
return false;
|
573
633
|
}
|
574
634
|
|
575
|
-
bool
|
635
|
+
static bool mb_std_is_tok_char(char *t)
|
576
636
|
{
|
577
|
-
|
578
|
-
|
579
|
-
|
580
|
-
|
581
|
-
|
582
|
-
|
583
|
-
|
584
|
-
|
637
|
+
wchar_t c;
|
638
|
+
mbstate_t state; ZEROSET(&state, mbstate_t);
|
639
|
+
|
640
|
+
if (((int)mbrtowc(&c, t, MB_CUR_MAX, &state)) < 0) {
|
641
|
+
/* error which we can handle next time round. For now just return
|
642
|
+
* false so that we can return a token */
|
643
|
+
return false;
|
644
|
+
}
|
645
|
+
if (iswspace(c)) {
|
646
|
+
return false; /* most common so check first. */
|
647
|
+
}
|
648
|
+
if (iswalnum(c) || w_isnumpunc(c) || c == L'&' || c == L'@' || c == L'\''
|
649
|
+
|| c == L':') {
|
650
|
+
return true;
|
651
|
+
}
|
652
|
+
return false;
|
585
653
|
}
|
586
654
|
|
587
655
|
/* (alnum)((punc)(alnum))+ where every second sequence of alnum must contain at
|
@@ -589,583 +657,669 @@ bool w_std_is_tok_char(char *t)
|
|
589
657
|
* (alnum) = [a-zA-Z0-9]
|
590
658
|
* (punc) = [_\/.,-]
|
591
659
|
*/
|
592
|
-
int std_get_number(char *input)
|
593
|
-
{
|
594
|
-
|
595
|
-
|
596
|
-
|
597
|
-
|
598
|
-
|
599
|
-
|
600
|
-
|
601
|
-
|
602
|
-
|
603
|
-
|
604
|
-
|
605
|
-
|
606
|
-
|
607
|
-
|
608
|
-
|
660
|
+
static int std_get_number(char *input)
|
661
|
+
{
|
662
|
+
int i = 0;
|
663
|
+
int count = 0;
|
664
|
+
int last_seen_digit = 2;
|
665
|
+
int seen_digit = false;
|
666
|
+
|
667
|
+
while (last_seen_digit >= 0) {
|
668
|
+
while ((input[i] != '\0') && isalnum(input[i])) {
|
669
|
+
if ((last_seen_digit < 2) && isdigit(input[i])) {
|
670
|
+
last_seen_digit = 2;
|
671
|
+
}
|
672
|
+
if ((seen_digit == false) && isdigit(input[i])) {
|
673
|
+
seen_digit = true;
|
674
|
+
}
|
675
|
+
i++;
|
676
|
+
}
|
677
|
+
last_seen_digit--;
|
678
|
+
if (!isnumpunc(input[i]) || !isalnum(input[i + 1])) {
|
679
|
+
|
680
|
+
if (last_seen_digit >= 0) {
|
681
|
+
count = i;
|
682
|
+
}
|
683
|
+
break;
|
684
|
+
}
|
609
685
|
count = i;
|
610
|
-
|
611
|
-
}
|
612
|
-
|
613
|
-
|
614
|
-
|
615
|
-
|
616
|
-
|
617
|
-
|
618
|
-
return 0;
|
686
|
+
i++;
|
687
|
+
}
|
688
|
+
if (seen_digit) {
|
689
|
+
return count;
|
690
|
+
}
|
691
|
+
else {
|
692
|
+
return 0;
|
693
|
+
}
|
619
694
|
}
|
620
695
|
|
621
|
-
int std_get_apostrophe(char *input)
|
696
|
+
static int std_get_apostrophe(char *input)
|
622
697
|
{
|
623
|
-
|
698
|
+
char *t = input;
|
624
699
|
|
625
|
-
|
626
|
-
|
700
|
+
while (isalpha(*t) || *t == '\'') {
|
701
|
+
t++;
|
702
|
+
}
|
627
703
|
|
628
|
-
|
704
|
+
return (int)(t - input);
|
629
705
|
}
|
630
706
|
|
631
|
-
int mb_std_get_apostrophe(char *input)
|
707
|
+
static int mb_std_get_apostrophe(char *input)
|
632
708
|
{
|
633
|
-
|
634
|
-
|
635
|
-
|
709
|
+
char *t = input;
|
710
|
+
wchar_t wchr;
|
711
|
+
int i;
|
712
|
+
mbstate_t state; ZEROSET(&state, mbstate_t);
|
636
713
|
|
637
|
-
|
638
|
-
|
639
|
-
|
640
|
-
|
641
|
-
|
642
|
-
|
714
|
+
i = mb_next_char(&wchr, t, &state);
|
715
|
+
|
716
|
+
while (iswalpha(wchr) || wchr == L'\'') {
|
717
|
+
t += i;
|
718
|
+
i = mb_next_char(&wchr, t, &state);
|
719
|
+
}
|
720
|
+
return (int)(t - input);
|
643
721
|
}
|
644
722
|
|
645
|
-
int std_get_url(char *input, char *token, int i)
|
723
|
+
static int std_get_url(char *input, char *token, int i)
|
646
724
|
{
|
647
|
-
|
648
|
-
|
649
|
-
|
650
|
-
|
651
|
-
|
652
|
-
|
725
|
+
while (isurlc(input[i])) {
|
726
|
+
if (isurlpunc(input[i]) && isurlpunc(input[i - 1])) {
|
727
|
+
break; /* can't have two puncs in a row */
|
728
|
+
}
|
729
|
+
if (i < MAX_WORD_SIZE) {
|
730
|
+
token[i] = input[i];
|
731
|
+
}
|
732
|
+
i++;
|
733
|
+
}
|
653
734
|
|
654
|
-
|
655
|
-
|
735
|
+
/* strip trailing puncs */
|
736
|
+
while (isurlpunc(input[i - 1])) {
|
737
|
+
i--;
|
738
|
+
}
|
656
739
|
|
657
|
-
|
740
|
+
return i;
|
658
741
|
}
|
659
742
|
|
660
743
|
/* Company names can contain '@' and '&' like AT&T and Excite@Home. Let's
|
661
|
-
|
662
|
-
int std_get_company_name(char *input)
|
744
|
+
*/
|
745
|
+
static int std_get_company_name(char *input)
|
663
746
|
{
|
664
|
-
|
665
|
-
|
666
|
-
|
747
|
+
int i = 0;
|
748
|
+
while (isalpha(input[i]) || input[i] == '@' || input[i] == '&') {
|
749
|
+
i++;
|
750
|
+
}
|
667
751
|
|
668
|
-
|
752
|
+
return i;
|
669
753
|
}
|
670
754
|
|
671
|
-
|
755
|
+
/*
|
756
|
+
static int mb_std_get_company_name(char *input, TokenStream *ts)
|
672
757
|
{
|
673
|
-
|
674
|
-
|
675
|
-
|
758
|
+
char *t = input;
|
759
|
+
wchar_t wchr;
|
760
|
+
int i;
|
761
|
+
mbstate_t state; ZEROSET(&state, mbstate_t);
|
676
762
|
|
677
|
-
|
678
|
-
|
679
|
-
|
680
|
-
|
681
|
-
|
763
|
+
i = mb_next_char(&wchr, t, &state);
|
764
|
+
while (iswalpha(wchr) || wchr == L'@' || wchr == L'&') {
|
765
|
+
t += i;
|
766
|
+
i = mb_next_char(&wchr, t, &state);
|
767
|
+
}
|
682
768
|
|
683
|
-
|
769
|
+
return (int)(t - input);
|
684
770
|
}
|
771
|
+
*/
|
685
772
|
|
686
|
-
bool std_advance_to_start(TokenStream *ts)
|
773
|
+
static bool std_advance_to_start(TokenStream *ts)
|
687
774
|
{
|
688
|
-
|
689
|
-
|
775
|
+
char *t = ts->t;
|
776
|
+
while (*t != '\0' && !isalnum(*t)) {
|
777
|
+
t++;
|
778
|
+
}
|
690
779
|
|
691
|
-
|
780
|
+
ts->t = t;
|
692
781
|
|
693
|
-
|
782
|
+
return (*t != '\0');
|
694
783
|
}
|
695
784
|
|
696
|
-
bool mb_std_advance_to_start(TokenStream *ts)
|
785
|
+
static bool mb_std_advance_to_start(TokenStream *ts)
|
697
786
|
{
|
698
|
-
|
699
|
-
|
787
|
+
int i;
|
788
|
+
wchar_t wchr;
|
789
|
+
mbstate_t state; ZEROSET(&state, mbstate_t);
|
700
790
|
|
701
|
-
|
702
|
-
while (w != 0 && !iswalnum(w)) {
|
703
|
-
ts->t += i;
|
704
|
-
if ((i = mbtowc(&w, ts->t, MB_CUR_MAX)) < 0) RAISE(IO_ERROR, ENC_ERR_MSG);
|
705
|
-
}
|
791
|
+
i = mb_next_char(&wchr, ts->t, &state);
|
706
792
|
|
707
|
-
|
708
|
-
|
793
|
+
while (wchr != 0 && !iswalnum(wchr)) {
|
794
|
+
ts->t += i;
|
795
|
+
i = mb_next_char(&wchr, ts->t, &state);
|
796
|
+
}
|
709
797
|
|
710
|
-
|
711
|
-
|
712
|
-
bool (*is_tok_char)(char *c);
|
713
|
-
int (*get_alpha)(TokenStream *ts, char *token);
|
714
|
-
int (*get_apostrophe)(char *input);
|
715
|
-
} StandardTokenizer;
|
798
|
+
return (wchr != 0);
|
799
|
+
}
|
716
800
|
|
717
|
-
Token *std_next(TokenStream *ts)
|
801
|
+
static Token *std_next(TokenStream *ts)
|
718
802
|
{
|
719
|
-
|
720
|
-
|
721
|
-
|
722
|
-
|
723
|
-
|
724
|
-
|
725
|
-
|
726
|
-
|
727
|
-
|
728
|
-
|
729
|
-
|
803
|
+
StandardTokenizer *std_tz = STDTS(ts);
|
804
|
+
char *s;
|
805
|
+
char *t;
|
806
|
+
char *start = NULL;
|
807
|
+
char *num_end = NULL;
|
808
|
+
char token[MAX_WORD_SIZE];
|
809
|
+
int token_i = 0;
|
810
|
+
int len;
|
811
|
+
bool is_acronym;
|
812
|
+
bool seen_at_symbol;
|
730
813
|
|
731
|
-
if (!std_tz->advance_to_start(ts)) return NULL;
|
732
814
|
|
733
|
-
|
734
|
-
|
735
|
-
t += std_get_number(t);
|
736
|
-
ts->t = t;
|
737
|
-
tk_set_ts(ts->token, start, t, ts->text, 1);
|
738
|
-
} else {
|
739
|
-
token_i = std_tz->get_alpha(ts, token);
|
740
|
-
t += token_i;
|
741
|
-
|
742
|
-
if (!std_tz->is_tok_char(t)) {
|
743
|
-
// very common case, ie a plain word, so check and return
|
744
|
-
tk_set_ts(ts->token, start, t, ts->text, 1);
|
745
|
-
ts->t = t;
|
746
|
-
return ts->token;
|
747
|
-
}
|
748
|
-
|
749
|
-
if (*t == '\'') { // apostrophe case.
|
750
|
-
t += std_tz->get_apostrophe(t);
|
751
|
-
ts->t = t;
|
752
|
-
len = (int)(t - start);
|
753
|
-
// strip possesive
|
754
|
-
if ((t[-1] == 's' || t[-1] == 'S') && t[-2] == '\'') t -= 2;
|
755
|
-
|
756
|
-
tk_set_ts(ts->token, start, t, ts->text, 1);
|
757
|
-
return ts->token;
|
758
|
-
}
|
759
|
-
|
760
|
-
if (*t == '&') { // apostrophe case.
|
761
|
-
t += std_get_company_name(t);
|
762
|
-
ts->t = t;
|
763
|
-
tk_set_ts(ts->token, start, t, ts->text, 1);
|
764
|
-
return ts->token;
|
765
|
-
}
|
766
|
-
|
767
|
-
if (isdigit(*t) || isnumpunc(*t)) { // possibly a number
|
768
|
-
num_end = start + std_get_number(start);
|
769
|
-
if (!std_tz->is_tok_char(num_end)) { // we won't find a longer token
|
770
|
-
ts->t = num_end;
|
771
|
-
tk_set_ts(ts->token, start, num_end, ts->text, 1);
|
772
|
-
return ts->token;
|
773
|
-
}
|
774
|
-
// else there may be a longer token so check
|
775
|
-
}
|
776
|
-
|
777
|
-
if (t[0] == ':' && t[1] == '/' && t[2] == '/') {
|
778
|
-
// check for a known url start
|
779
|
-
token[token_i] = '\0';
|
780
|
-
t += 3;
|
781
|
-
while (*t == '/') t++;
|
782
|
-
if (isalpha(*t) &&
|
783
|
-
(memcmp(token, "ftp", 3) == 0 ||
|
784
|
-
memcmp(token, "http", 4) == 0 ||
|
785
|
-
memcmp(token, "https", 5) == 0 ||
|
786
|
-
memcmp(token, "file", 4) == 0)) {
|
787
|
-
len = std_get_url(t, token, 0); // dispose of first part of the URL
|
788
|
-
} else { //still treat as url but keep the first part
|
789
|
-
token_i = (int)(t - start);
|
790
|
-
memcpy(token, start, token_i * sizeof(char));
|
791
|
-
len = token_i + std_get_url(t, token, token_i); // keep start
|
792
|
-
}
|
793
|
-
ts->t = t + len;
|
794
|
-
token[len] = 0;
|
795
|
-
tk_set(ts->token, token, len, (int)(start - ts->text),
|
796
|
-
(int)(ts->t - ts->text), 1);
|
797
|
-
return ts->token;
|
798
|
-
}
|
799
|
-
|
800
|
-
// now see how long a url we can find.
|
801
|
-
is_acronym = true;
|
802
|
-
seen_at_symbol = false;
|
803
|
-
while (isurlxatc(*t)) {
|
804
|
-
if (is_acronym && !isalpha(*t) && (*t != '.')) {
|
805
|
-
is_acronym = false;
|
806
|
-
}
|
807
|
-
if (isurlxatpunc(*t) && isurlxatpunc(t[-1])) {
|
808
|
-
break; // can't have two punctuation characters in a row
|
809
|
-
}
|
810
|
-
if (*t == '@') {
|
811
|
-
if (seen_at_symbol) {
|
812
|
-
break; // we can only have one @ symbol
|
813
|
-
} else {
|
814
|
-
seen_at_symbol = true;
|
815
|
-
}
|
816
|
-
}
|
817
|
-
t++;
|
815
|
+
if (!std_tz->advance_to_start(ts)) {
|
816
|
+
return NULL;
|
818
817
|
}
|
819
|
-
while (isurlxatpunc(t[-1])) t--; // strip trailing punctuation
|
820
818
|
|
821
|
-
|
822
|
-
|
819
|
+
start = t = ts->t;
|
820
|
+
if (isdigit(*t)) {
|
821
|
+
t += std_get_number(t);
|
822
|
+
ts->t = t;
|
823
|
+
tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
|
824
|
+
}
|
825
|
+
else {
|
826
|
+
token_i = std_tz->get_alpha(ts, token);
|
827
|
+
t += token_i;
|
828
|
+
|
829
|
+
if (!std_tz->is_tok_char(t)) {
|
830
|
+
/* very common case, ie a plain word, so check and return */
|
831
|
+
ts->t = t;
|
832
|
+
return tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
|
833
|
+
}
|
834
|
+
|
835
|
+
if (*t == '\'') { /* apostrophe case. */
|
836
|
+
t += std_tz->get_apostrophe(t);
|
837
|
+
ts->t = t;
|
838
|
+
len = (int)(t - start);
|
839
|
+
/* strip possessive */
|
840
|
+
if ((t[-1] == 's' || t[-1] == 'S') && t[-2] == '\'') {
|
841
|
+
t -= 2;
|
842
|
+
}
|
823
843
|
|
824
|
-
|
825
|
-
for (s = start; s < t-1; s++) {
|
826
|
-
if (isalpha(*s) && (s[1] != '.')) is_acronym = false;
|
844
|
+
return tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
|
827
845
|
}
|
828
|
-
|
829
|
-
|
830
|
-
|
831
|
-
|
832
|
-
token
|
833
|
-
|
834
|
-
|
846
|
+
|
847
|
+
if (*t == '&') { /* company name case. */
|
848
|
+
t += std_get_company_name(t);
|
849
|
+
ts->t = t;
|
850
|
+
return tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
|
851
|
+
}
|
852
|
+
|
853
|
+
if (isdigit(*t) || isnumpunc(*t)) { /* possibly a number */
|
854
|
+
num_end = start + std_get_number(start);
|
855
|
+
if (!std_tz->is_tok_char(num_end)) { /* won't find a longer token */
|
856
|
+
ts->t = num_end;
|
857
|
+
return tk_set_ts(&(CTS(ts)->token), start, num_end, ts->text, 1);
|
858
|
+
}
|
859
|
+
/* else there may be a longer token so check */
|
860
|
+
}
|
861
|
+
|
862
|
+
if (t[0] == ':' && t[1] == '/' && t[2] == '/') {
|
863
|
+
/* check for a known url start */
|
864
|
+
token[token_i] = '\0';
|
865
|
+
t += 3;
|
866
|
+
while (*t == '/') {
|
867
|
+
t++;
|
868
|
+
}
|
869
|
+
if (isalpha(*t) &&
|
870
|
+
(memcmp(token, "ftp", 3) == 0 ||
|
871
|
+
memcmp(token, "http", 4) == 0 ||
|
872
|
+
memcmp(token, "https", 5) == 0 ||
|
873
|
+
memcmp(token, "file", 4) == 0)) {
|
874
|
+
len = std_get_url(t, token, 0); /* dispose of first part of the URL */
|
875
|
+
}
|
876
|
+
else { /* still treat as url but keep the first part */
|
877
|
+
token_i = (int)(t - start);
|
878
|
+
memcpy(token, start, token_i * sizeof(char));
|
879
|
+
len = token_i + std_get_url(t, token, token_i); /* keep start */
|
880
|
+
}
|
881
|
+
ts->t = t + len;
|
882
|
+
token[len] = 0;
|
883
|
+
return tk_set(&(CTS(ts)->token), token, len, (int)(start - ts->text),
|
884
|
+
(int)(ts->t - ts->text), 1);
|
885
|
+
}
|
886
|
+
|
887
|
+
/* now see how long a url we can find. */
|
888
|
+
is_acronym = true;
|
889
|
+
seen_at_symbol = false;
|
890
|
+
while (isurlxatc(*t)) {
|
891
|
+
if (is_acronym && !isalpha(*t) && (*t != '.')) {
|
892
|
+
is_acronym = false;
|
893
|
+
}
|
894
|
+
if (isurlxatpunc(*t) && isurlxatpunc(t[-1])) {
|
895
|
+
break; /* can't have two punctuation characters in a row */
|
896
|
+
}
|
897
|
+
if (*t == '@') {
|
898
|
+
if (seen_at_symbol) {
|
899
|
+
break; /* we can only have one @ symbol */
|
900
|
+
}
|
901
|
+
else {
|
902
|
+
seen_at_symbol = true;
|
903
|
+
}
|
904
|
+
}
|
905
|
+
t++;
|
906
|
+
}
|
907
|
+
while (isurlxatpunc(t[-1])) {
|
908
|
+
t--; /* strip trailing punctuation */
|
909
|
+
}
|
910
|
+
|
911
|
+
if (t > num_end) {
|
912
|
+
ts->t = t;
|
913
|
+
|
914
|
+
if (is_acronym) { /* check it is one letter followed by one '.' */
|
915
|
+
for (s = start; s < t - 1; s++) {
|
916
|
+
if (isalpha(*s) && (s[1] != '.'))
|
917
|
+
is_acronym = false;
|
918
|
+
}
|
919
|
+
}
|
920
|
+
if (is_acronym) { /* strip '.'s */
|
921
|
+
for (s = start + token_i; s < t; s++) {
|
922
|
+
if (*s != '.') {
|
923
|
+
token[token_i] = *s;
|
924
|
+
token_i++;
|
925
|
+
}
|
926
|
+
}
|
927
|
+
tk_set(&(CTS(ts)->token), token, token_i,
|
928
|
+
(int)(start - ts->text),
|
929
|
+
(int)(t - ts->text), 1);
|
930
|
+
}
|
931
|
+
else { /* just return the url as is */
|
932
|
+
tk_set_ts(&(CTS(ts)->token), start, t, ts->text, 1);
|
933
|
+
}
|
934
|
+
}
|
935
|
+
else { /* return the number */
|
936
|
+
ts->t = num_end;
|
937
|
+
tk_set_ts(&(CTS(ts)->token), start, num_end, ts->text, 1);
|
835
938
|
}
|
836
|
-
tk_set(ts->token, token, token_i, (int)(start - ts->text),
|
837
|
-
(int)(t - ts->text), 1);
|
838
|
-
} else { // just return the url as is
|
839
|
-
tk_set_ts(ts->token, start, t, ts->text, 1);
|
840
|
-
}
|
841
|
-
} else { // return the number
|
842
|
-
ts->t = num_end;
|
843
|
-
tk_set_ts(ts->token, start, num_end, ts->text, 1);
|
844
939
|
}
|
845
|
-
}
|
846
940
|
|
847
|
-
|
941
|
+
return &(CTS(ts)->token);
|
848
942
|
}
|
849
943
|
|
850
|
-
|
944
|
+
static TokenStream *std_ts_clone_i(TokenStream *orig_ts)
|
851
945
|
{
|
852
|
-
|
853
|
-
ts_standard_destroy(ts);
|
946
|
+
return ts_clone_size(orig_ts, sizeof(StandardTokenizer));
|
854
947
|
}
|
855
948
|
|
856
|
-
|
949
|
+
static TokenStream *std_ts_new()
|
857
950
|
{
|
858
|
-
|
859
|
-
|
951
|
+
TokenStream *ts = ts_new(StandardTokenizer);
|
952
|
+
|
953
|
+
ts->clone_i = &std_ts_clone_i;
|
954
|
+
ts->next = &std_next;
|
955
|
+
|
956
|
+
return ts;
|
860
957
|
}
|
861
958
|
|
862
|
-
TokenStream *
|
959
|
+
TokenStream *standard_tokenizer_new()
|
863
960
|
{
|
864
|
-
|
961
|
+
TokenStream *ts = std_ts_new();
|
865
962
|
|
866
|
-
|
867
|
-
|
868
|
-
|
869
|
-
|
870
|
-
std_tz->get_apostrophe = &std_get_apostrophe;
|
963
|
+
STDTS(ts)->advance_to_start = &std_advance_to_start;
|
964
|
+
STDTS(ts)->get_alpha = &std_get_alpha;
|
965
|
+
STDTS(ts)->is_tok_char = &std_is_tok_char;
|
966
|
+
STDTS(ts)->get_apostrophe = &std_get_apostrophe;
|
871
967
|
|
872
|
-
|
873
|
-
ts->destroy = &std_ts_destroy;
|
874
|
-
ts->clone_i = &std_ts_clone_i;
|
875
|
-
ts->next = &std_next;
|
876
|
-
return ts;
|
968
|
+
return ts;
|
877
969
|
}
|
878
970
|
|
879
|
-
TokenStream *
|
971
|
+
TokenStream *mb_standard_tokenizer_new()
|
880
972
|
{
|
881
|
-
|
973
|
+
TokenStream *ts = std_ts_new();
|
882
974
|
|
883
|
-
|
884
|
-
|
885
|
-
|
886
|
-
|
887
|
-
std_tz->get_apostrophe = &mb_std_get_apostrophe;
|
975
|
+
STDTS(ts)->advance_to_start = &mb_std_advance_to_start;
|
976
|
+
STDTS(ts)->get_alpha = &mb_std_get_alpha;
|
977
|
+
STDTS(ts)->is_tok_char = &mb_std_is_tok_char;
|
978
|
+
STDTS(ts)->get_apostrophe = &mb_std_get_apostrophe;
|
888
979
|
|
889
|
-
|
890
|
-
ts->destroy = &std_ts_destroy;
|
891
|
-
ts->clone_i = &std_ts_clone_i;
|
892
|
-
ts->next = &std_next;
|
893
|
-
return ts;
|
980
|
+
return ts;
|
894
981
|
}
|
895
982
|
|
896
|
-
|
983
|
+
/****************************************************************************
|
984
|
+
*
|
985
|
+
* Filters
|
986
|
+
*
|
987
|
+
****************************************************************************/
|
988
|
+
|
989
|
+
#define TkFilt(filter) ((TokenFilter *)(filter))
|
990
|
+
|
991
|
+
TokenStream *filter_clone_size(TokenStream *ts, size_t size)
|
897
992
|
{
|
898
|
-
|
993
|
+
TokenStream *ts_new = ts_clone_size(ts, size);
|
994
|
+
TkFilt(ts_new)->sub_ts = TkFilt(ts)->sub_ts->clone_i(TkFilt(ts)->sub_ts);
|
995
|
+
return ts_new;
|
899
996
|
}
|
900
997
|
|
901
|
-
|
998
|
+
static TokenStream *filter_clone_i(TokenStream *ts)
|
902
999
|
{
|
903
|
-
|
904
|
-
if (tf->token != NULL) tk_destroy(tf->token);
|
905
|
-
free(tf);
|
1000
|
+
return filter_clone_size(ts, sizeof(TokenFilter));
|
906
1001
|
}
|
907
1002
|
|
908
|
-
|
1003
|
+
static TokenStream *filter_reset(TokenStream *ts, char *text)
|
1004
|
+
{
|
1005
|
+
TkFilt(ts)->sub_ts->reset(TkFilt(ts)->sub_ts, text);
|
1006
|
+
return ts;
|
1007
|
+
}
|
1008
|
+
|
1009
|
+
static void filter_destroy_i(TokenStream *ts)
|
1010
|
+
{
|
1011
|
+
ts_deref(TkFilt(ts)->sub_ts);
|
1012
|
+
free(ts);
|
1013
|
+
}
|
1014
|
+
|
1015
|
+
#define tf_new(type, sub) tf_new_i(sizeof(type), sub)
|
1016
|
+
TokenStream *tf_new_i(size_t size, TokenStream *sub_ts)
|
1017
|
+
{
|
1018
|
+
TokenStream *ts = (TokenStream *)ecalloc(size);
|
1019
|
+
|
1020
|
+
TkFilt(ts)->sub_ts = sub_ts;
|
1021
|
+
|
1022
|
+
ts->clone_i = &filter_clone_i;
|
1023
|
+
ts->destroy_i = &filter_destroy_i;
|
1024
|
+
ts->reset = &filter_reset;
|
1025
|
+
ts->ref_cnt = 1;
|
1026
|
+
|
1027
|
+
return ts;
|
1028
|
+
}
|
1029
|
+
|
1030
|
+
/****************************************************************************
|
1031
|
+
* StopFilter
|
1032
|
+
****************************************************************************/
|
1033
|
+
|
1034
|
+
#define StopFilt(filter) ((StopFilter *)(filter))
|
1035
|
+
|
1036
|
+
static void sf_destroy_i(TokenStream *ts)
|
909
1037
|
{
|
910
|
-
|
911
|
-
|
912
|
-
filter_destroy(tf);
|
1038
|
+
h_destroy(StopFilt(ts)->words);
|
1039
|
+
filter_destroy_i(ts);
|
913
1040
|
}
|
914
1041
|
|
915
|
-
void sf_clone_i_i(void *key, void *value, void *arg)
|
1042
|
+
static void sf_clone_i_i(void *key, void *value, void *arg)
|
916
1043
|
{
|
917
|
-
|
918
|
-
|
919
|
-
|
1044
|
+
HashTable *word_table = (HashTable *)arg;
|
1045
|
+
char *word = estrdup(key);
|
1046
|
+
(void)value;
|
1047
|
+
h_set(word_table, word, word);
|
920
1048
|
}
|
921
1049
|
|
922
|
-
|
1050
|
+
static TokenStream *sf_clone_i(TokenStream *orig_ts)
|
923
1051
|
{
|
924
|
-
|
925
|
-
|
1052
|
+
TokenStream *new_ts = filter_clone_size(orig_ts, sizeof(StopFilter));
|
1053
|
+
StopFilt(new_ts)->words = h_new_str(&free, NULL);
|
1054
|
+
h_each(StopFilt(orig_ts)->words, &sf_clone_i_i, StopFilt(new_ts)->words);
|
1055
|
+
return new_ts;
|
926
1056
|
}
|
927
1057
|
|
928
|
-
Token *sf_next(TokenStream *
|
1058
|
+
static Token *sf_next(TokenStream *ts)
|
929
1059
|
{
|
930
|
-
|
931
|
-
|
932
|
-
|
933
|
-
|
934
|
-
|
935
|
-
|
936
|
-
|
937
|
-
|
938
|
-
|
1060
|
+
int pos_inc = 1;
|
1061
|
+
HashTable *words = StopFilt(ts)->words;
|
1062
|
+
TokenFilter *tf = TkFilt(ts);
|
1063
|
+
Token *tk = tf->sub_ts->next(tf->sub_ts);
|
1064
|
+
|
1065
|
+
while ((tk != NULL) && (h_get(words, tk->text) != NULL)) {
|
1066
|
+
tk = tf->sub_ts->next(tf->sub_ts);
|
1067
|
+
pos_inc++;
|
1068
|
+
}
|
1069
|
+
|
1070
|
+
if (tk != NULL) {
|
1071
|
+
tk->pos_inc = pos_inc;
|
1072
|
+
}
|
1073
|
+
|
1074
|
+
return tk;
|
939
1075
|
}
|
940
1076
|
|
941
|
-
TokenStream *
|
942
|
-
|
1077
|
+
TokenStream *stop_filter_new_with_words_len(TokenStream *sub_ts,
|
1078
|
+
const char **words, int len)
|
943
1079
|
{
|
944
|
-
|
945
|
-
|
946
|
-
|
947
|
-
|
948
|
-
tf->sub_ts = ts;
|
1080
|
+
int i;
|
1081
|
+
char *word;
|
1082
|
+
HashTable *word_table = h_new_str(&free, (free_ft) NULL);
|
1083
|
+
TokenStream *ts = tf_new(StopFilter, sub_ts);
|
949
1084
|
|
950
|
-
|
951
|
-
|
952
|
-
|
953
|
-
|
954
|
-
|
955
|
-
|
956
|
-
|
957
|
-
|
958
|
-
|
959
|
-
tf->clone_i = &sf_clone_i;
|
960
|
-
tf->ref_cnt = 1;
|
961
|
-
return tf;
|
1085
|
+
for (i = 0; i < len; i++) {
|
1086
|
+
word = estrdup(words[i]);
|
1087
|
+
h_set(word_table, word, word);
|
1088
|
+
}
|
1089
|
+
StopFilt(ts)->words = word_table;
|
1090
|
+
ts->next = &sf_next;
|
1091
|
+
ts->destroy_i = &sf_destroy_i;
|
1092
|
+
ts->clone_i = &sf_clone_i;
|
1093
|
+
return ts;
|
962
1094
|
}
|
963
1095
|
|
964
|
-
TokenStream *
|
1096
|
+
TokenStream *stop_filter_new_with_words(TokenStream *sub_ts,
|
1097
|
+
const char **words)
|
965
1098
|
{
|
966
|
-
|
967
|
-
|
968
|
-
|
969
|
-
|
970
|
-
|
971
|
-
|
972
|
-
|
973
|
-
|
974
|
-
|
975
|
-
|
976
|
-
|
977
|
-
|
978
|
-
|
979
|
-
|
980
|
-
|
981
|
-
tf->ref_cnt = 1;
|
982
|
-
return tf;
|
1099
|
+
char *word;
|
1100
|
+
HashTable *word_table = h_new_str(&free, (free_ft) NULL);
|
1101
|
+
TokenStream *ts = tf_new(StopFilter, sub_ts);
|
1102
|
+
|
1103
|
+
while (*words) {
|
1104
|
+
word = estrdup(*words);
|
1105
|
+
h_set(word_table, word, word);
|
1106
|
+
words++;
|
1107
|
+
}
|
1108
|
+
|
1109
|
+
StopFilt(ts)->words = word_table;
|
1110
|
+
ts->next = &sf_next;
|
1111
|
+
ts->destroy_i = &sf_destroy_i;
|
1112
|
+
ts->clone_i = &sf_clone_i;
|
1113
|
+
return ts;
|
983
1114
|
}
|
984
1115
|
|
985
|
-
TokenStream *
|
1116
|
+
TokenStream *stop_filter_new(TokenStream *ts)
|
986
1117
|
{
|
987
|
-
|
1118
|
+
return stop_filter_new_with_words(ts, FULL_ENGLISH_STOP_WORDS);
|
988
1119
|
}
|
989
1120
|
|
1121
|
+
/****************************************************************************
|
1122
|
+
* LowerCaseFilter
|
1123
|
+
****************************************************************************/
|
1124
|
+
|
1125
|
+
|
990
1126
|
Token *mb_lcf_next(TokenStream *ts)
|
991
1127
|
{
|
992
|
-
|
993
|
-
|
994
|
-
int i;
|
995
|
-
Token *tk = ts->sub_ts->next(ts->sub_ts);
|
996
|
-
if (tk == NULL) return tk;
|
1128
|
+
wchar_t wbuf[MAX_WORD_SIZE], *wchr;
|
1129
|
+
Token *tk = TkFilt(ts)->sub_ts->next(TkFilt(ts)->sub_ts);
|
997
1130
|
|
998
|
-
|
999
|
-
|
1000
|
-
|
1001
|
-
|
1002
|
-
|
1003
|
-
|
1004
|
-
|
1005
|
-
|
1131
|
+
if (tk == NULL) {
|
1132
|
+
return tk;
|
1133
|
+
}
|
1134
|
+
|
1135
|
+
mbstowcs(wbuf, tk->text, MAX_WORD_SIZE);
|
1136
|
+
wchr = wbuf;
|
1137
|
+
while (*wchr != 0) {
|
1138
|
+
*wchr = towlower(*wchr);
|
1139
|
+
wchr++;
|
1140
|
+
}
|
1141
|
+
tk->len = wcstombs(tk->text, wbuf, MAX_WORD_SIZE);
|
1142
|
+
tk->text[tk->len] = '\0';
|
1143
|
+
return tk;
|
1006
1144
|
}
|
1007
1145
|
|
1008
|
-
TokenStream *
|
1146
|
+
TokenStream *mb_lowercase_filter_new(TokenStream *sub_ts)
|
1009
1147
|
{
|
1010
|
-
|
1011
|
-
|
1012
|
-
|
1013
|
-
tf->reset = &filter_reset;
|
1014
|
-
tf->destroy = &filter_destroy;
|
1015
|
-
tf->sub_ts = ts;
|
1016
|
-
tf->clone_i = NULL;
|
1017
|
-
tf->ref_cnt = 1;
|
1018
|
-
return tf;
|
1148
|
+
TokenStream *ts = tf_new(TokenFilter, sub_ts);
|
1149
|
+
ts->next = &mb_lcf_next;
|
1150
|
+
return ts;
|
1019
1151
|
}
|
1020
1152
|
|
1021
1153
|
Token *lcf_next(TokenStream *ts)
|
1022
1154
|
{
|
1023
|
-
|
1024
|
-
|
1025
|
-
|
1026
|
-
|
1027
|
-
|
1028
|
-
i
|
1029
|
-
|
1030
|
-
|
1155
|
+
int i = 0;
|
1156
|
+
Token *tk = TkFilt(ts)->sub_ts->next(TkFilt(ts)->sub_ts);
|
1157
|
+
if (tk == NULL) {
|
1158
|
+
return tk;
|
1159
|
+
}
|
1160
|
+
while (tk->text[i] != '\0') {
|
1161
|
+
tk->text[i] = tolower(tk->text[i]);
|
1162
|
+
i++;
|
1163
|
+
}
|
1164
|
+
return tk;
|
1031
1165
|
}
|
1032
1166
|
|
1033
|
-
TokenStream *
|
1167
|
+
TokenStream *lowercase_filter_new(TokenStream *sub_ts)
|
1034
1168
|
{
|
1035
|
-
|
1036
|
-
|
1037
|
-
|
1038
|
-
tf->reset = &filter_reset;
|
1039
|
-
tf->destroy = &filter_destroy;
|
1040
|
-
tf->sub_ts = ts;
|
1041
|
-
tf->clone_i = NULL;
|
1042
|
-
tf->ref_cnt = 1;
|
1043
|
-
return tf;
|
1169
|
+
TokenStream *ts = tf_new(TokenFilter, sub_ts);
|
1170
|
+
ts->next = &lcf_next;
|
1171
|
+
return ts;
|
1044
1172
|
}
|
1045
1173
|
|
1046
|
-
|
1047
|
-
|
1048
|
-
|
1049
|
-
|
1050
|
-
|
1174
|
+
/****************************************************************************
|
1175
|
+
* StemFilter
|
1176
|
+
****************************************************************************/
|
1177
|
+
|
1178
|
+
#define StemFilt(filter) ((StemFilter *)(filter))
|
1051
1179
|
|
1052
|
-
void
|
1180
|
+
void stemf_destroy_i(TokenStream *ts)
|
1053
1181
|
{
|
1054
|
-
|
1055
|
-
|
1056
|
-
|
1057
|
-
|
1058
|
-
free(stemf);
|
1059
|
-
filter_destroy(tf);
|
1182
|
+
sb_stemmer_delete(StemFilt(ts)->stemmer);
|
1183
|
+
free(StemFilt(ts)->algorithm);
|
1184
|
+
free(StemFilt(ts)->charenc);
|
1185
|
+
filter_destroy_i(ts);
|
1060
1186
|
}
|
1061
1187
|
|
1062
1188
|
Token *stemf_next(TokenStream *ts)
|
1063
1189
|
{
|
1064
|
-
|
1065
|
-
|
1066
|
-
|
1067
|
-
|
1068
|
-
|
1069
|
-
|
1070
|
-
|
1071
|
-
|
1072
|
-
|
1073
|
-
|
1074
|
-
|
1190
|
+
int len;
|
1191
|
+
const sb_symbol *stemmed;
|
1192
|
+
struct sb_stemmer *stemmer = StemFilt(ts)->stemmer;
|
1193
|
+
TokenFilter *tf = TkFilt(ts);
|
1194
|
+
Token *tk = tf->sub_ts->next(tf->sub_ts);
|
1195
|
+
if (tk == NULL) {
|
1196
|
+
return tk;
|
1197
|
+
}
|
1198
|
+
stemmed = sb_stemmer_stem(stemmer, (sb_symbol *)tk->text, tk->len);
|
1199
|
+
len = sb_stemmer_length(stemmer);
|
1200
|
+
if (len >= MAX_WORD_SIZE) {
|
1201
|
+
len = MAX_WORD_SIZE - 1;
|
1202
|
+
}
|
1203
|
+
|
1204
|
+
memcpy(tk->text, stemmed, len);
|
1205
|
+
tk->text[len] = '\0';
|
1206
|
+
tk->len = len;
|
1207
|
+
return tk;
|
1075
1208
|
}
|
1076
1209
|
|
1077
|
-
|
1210
|
+
TokenStream *stemf_clone_i(TokenStream *orig_ts)
|
1078
1211
|
{
|
1079
|
-
|
1080
|
-
|
1081
|
-
|
1082
|
-
|
1083
|
-
|
1084
|
-
|
1212
|
+
TokenStream *new_ts = filter_clone_size(orig_ts, sizeof(StemFilter));
|
1213
|
+
StemFilter *stemf = StemFilt(new_ts);
|
1214
|
+
StemFilter *orig_stemf = StemFilt(orig_ts);
|
1215
|
+
stemf->stemmer =
|
1216
|
+
sb_stemmer_new(orig_stemf->algorithm, orig_stemf->charenc);
|
1217
|
+
stemf->algorithm =
|
1218
|
+
orig_stemf->algorithm ? estrdup(orig_stemf->algorithm) : NULL;
|
1219
|
+
stemf->charenc =
|
1220
|
+
orig_stemf->charenc ? estrdup(orig_stemf->charenc) : NULL;
|
1221
|
+
return new_ts;
|
1085
1222
|
}
|
1086
1223
|
|
1087
|
-
TokenStream *
|
1088
|
-
|
1224
|
+
TokenStream *stem_filter_new(TokenStream *ts, const char *algorithm,
|
1225
|
+
const char *charenc)
|
1089
1226
|
{
|
1090
|
-
|
1091
|
-
|
1092
|
-
|
1093
|
-
|
1094
|
-
|
1095
|
-
tf->data = stemf;
|
1227
|
+
TokenStream *tf = tf_new(StemFilter, ts);
|
1228
|
+
|
1229
|
+
StemFilt(tf)->stemmer = sb_stemmer_new(algorithm, charenc);
|
1230
|
+
StemFilt(tf)->algorithm = algorithm ? estrdup(algorithm) : NULL;
|
1231
|
+
StemFilt(tf)->charenc = charenc ? estrdup(charenc) : NULL;
|
1096
1232
|
|
1097
|
-
|
1098
|
-
|
1099
|
-
|
1100
|
-
|
1101
|
-
tf->clone_i = &stemf_clone_i;
|
1102
|
-
tf->sub_ts = ts;
|
1103
|
-
tf->ref_cnt = 1;
|
1104
|
-
return tf;
|
1233
|
+
tf->next = &stemf_next;
|
1234
|
+
tf->destroy_i = &stemf_destroy_i;
|
1235
|
+
tf->clone_i = &stemf_clone_i;
|
1236
|
+
return tf;
|
1105
1237
|
}
|
1106
1238
|
|
1107
|
-
|
1108
|
-
|
1239
|
+
/****************************************************************************
|
1240
|
+
*
|
1241
|
+
* Analyzers
|
1242
|
+
*
|
1243
|
+
****************************************************************************/
|
1244
|
+
|
1245
|
+
/****************************************************************************
|
1246
|
+
* Standard
|
1247
|
+
****************************************************************************/
|
1248
|
+
|
1249
|
+
Analyzer *standard_analyzer_new_with_words_len(const char **words, int len,
|
1250
|
+
bool lowercase)
|
1109
1251
|
{
|
1110
|
-
|
1111
|
-
|
1112
|
-
|
1113
|
-
|
1114
|
-
|
1115
|
-
|
1116
|
-
|
1117
|
-
|
1118
|
-
|
1252
|
+
TokenStream *ts;
|
1253
|
+
if (lowercase) {
|
1254
|
+
ts = stop_filter_new_with_words_len(lowercase_filter_new
|
1255
|
+
(standard_tokenizer_new()),
|
1256
|
+
words, len);
|
1257
|
+
}
|
1258
|
+
else {
|
1259
|
+
ts = stop_filter_new_with_words_len(standard_tokenizer_new(),
|
1260
|
+
words, len);
|
1261
|
+
}
|
1262
|
+
return analyzer_new(ts, NULL, NULL);
|
1119
1263
|
}
|
1120
1264
|
|
1121
|
-
Analyzer *
|
1265
|
+
Analyzer *standard_analyzer_new_with_words(const char **words,
|
1266
|
+
bool lowercase)
|
1122
1267
|
{
|
1123
|
-
|
1124
|
-
|
1125
|
-
|
1126
|
-
|
1127
|
-
|
1128
|
-
|
1129
|
-
|
1130
|
-
|
1131
|
-
|
1268
|
+
TokenStream *ts;
|
1269
|
+
if (lowercase) {
|
1270
|
+
ts = stop_filter_new_with_words(lowercase_filter_new
|
1271
|
+
(standard_tokenizer_new()),
|
1272
|
+
words);
|
1273
|
+
}
|
1274
|
+
else {
|
1275
|
+
ts = stop_filter_new_with_words(standard_tokenizer_new(),
|
1276
|
+
words);
|
1277
|
+
}
|
1278
|
+
return analyzer_new(ts, NULL, NULL);
|
1132
1279
|
}
|
1133
1280
|
|
1134
|
-
Analyzer *
|
1135
|
-
|
1281
|
+
Analyzer *mb_standard_analyzer_new_with_words_len(const char **words,
|
1282
|
+
int len, bool lowercase)
|
1136
1283
|
{
|
1137
|
-
|
1138
|
-
|
1139
|
-
|
1140
|
-
|
1141
|
-
|
1142
|
-
|
1143
|
-
|
1144
|
-
|
1145
|
-
|
1284
|
+
TokenStream *ts;
|
1285
|
+
if (lowercase) {
|
1286
|
+
ts = stop_filter_new_with_words_len(mb_lowercase_filter_new
|
1287
|
+
(mb_standard_tokenizer_new
|
1288
|
+
()), words, len);
|
1289
|
+
}
|
1290
|
+
else {
|
1291
|
+
ts = stop_filter_new_with_words_len(mb_standard_tokenizer_new(),
|
1292
|
+
words, len);
|
1293
|
+
}
|
1294
|
+
return analyzer_new(ts, NULL, NULL);
|
1146
1295
|
}
|
1147
1296
|
|
1148
|
-
Analyzer *
|
1149
|
-
|
1297
|
+
Analyzer *mb_standard_analyzer_new_with_words(const char **words,
|
1298
|
+
bool lowercase)
|
1150
1299
|
{
|
1151
|
-
|
1152
|
-
|
1153
|
-
|
1154
|
-
|
1155
|
-
|
1156
|
-
|
1157
|
-
|
1158
|
-
|
1300
|
+
TokenStream *ts;
|
1301
|
+
if (lowercase) {
|
1302
|
+
ts = stop_filter_new_with_words(mb_lowercase_filter_new
|
1303
|
+
(mb_standard_tokenizer_new()),
|
1304
|
+
words);
|
1305
|
+
}
|
1306
|
+
else {
|
1307
|
+
ts = stop_filter_new_with_words(mb_standard_tokenizer_new(),
|
1308
|
+
words);
|
1309
|
+
}
|
1310
|
+
return analyzer_new(ts, NULL, NULL);
|
1159
1311
|
}
|
1160
1312
|
|
1161
|
-
Analyzer *
|
1313
|
+
Analyzer *standard_analyzer_new(bool lowercase)
|
1162
1314
|
{
|
1163
|
-
|
1315
|
+
return standard_analyzer_new_with_words(FULL_ENGLISH_STOP_WORDS,
|
1316
|
+
lowercase);
|
1164
1317
|
}
|
1165
1318
|
|
1166
|
-
Analyzer *
|
1319
|
+
Analyzer *mb_standard_analyzer_new(bool lowercase)
|
1167
1320
|
{
|
1168
|
-
|
1321
|
+
return mb_standard_analyzer_new_with_words(FULL_ENGLISH_STOP_WORDS,
|
1322
|
+
lowercase);
|
1169
1323
|
}
|
1170
1324
|
|
1171
1325
|
/****************************************************************************
|
@@ -1174,58 +1328,63 @@ Analyzer *mb_standard_analyzer_create(bool lowercase)
|
|
1174
1328
|
*
|
1175
1329
|
****************************************************************************/
|
1176
1330
|
|
1177
|
-
|
1331
|
+
#define PFA(analyzer) ((PerFieldAnalyzer *)(analyzer))
|
1332
|
+
void pfa_destroy_i(Analyzer *self)
|
1178
1333
|
{
|
1179
|
-
|
1180
|
-
h_destroy(pfa->dict);
|
1334
|
+
h_destroy(PFA(self)->dict);
|
1181
1335
|
|
1182
|
-
|
1183
|
-
|
1184
|
-
free(self);
|
1336
|
+
a_deref(PFA(self)->default_a);
|
1337
|
+
free(self);
|
1185
1338
|
}
|
1186
1339
|
|
1187
1340
|
TokenStream *pfa_get_ts(Analyzer *self, char *field, char *text)
|
1188
1341
|
{
|
1189
|
-
|
1190
|
-
|
1191
|
-
|
1192
|
-
|
1342
|
+
Analyzer *a = h_get(PFA(self)->dict, field);
|
1343
|
+
if (a == NULL) {
|
1344
|
+
a = PFA(self)->default_a;
|
1345
|
+
}
|
1346
|
+
return a_get_ts(a, field, text);
|
1193
1347
|
}
|
1194
1348
|
|
1195
|
-
void
|
1349
|
+
void pfa_sub_a_destroy_i(void *p)
|
1196
1350
|
{
|
1197
|
-
|
1198
|
-
|
1351
|
+
Analyzer *a = (Analyzer *) p;
|
1352
|
+
a_deref(a);
|
1199
1353
|
}
|
1200
1354
|
|
1201
1355
|
void pfa_add_field(Analyzer *self, char *field, Analyzer *analyzer)
|
1202
1356
|
{
|
1203
|
-
|
1204
|
-
h_set(pfa->dict, estrdup(field), analyzer);
|
1357
|
+
h_set(PFA(self)->dict, estrdup(field), analyzer);
|
1205
1358
|
}
|
1206
1359
|
|
1207
|
-
Analyzer *
|
1360
|
+
Analyzer *per_field_analyzer_new(Analyzer *default_a)
|
1208
1361
|
{
|
1209
|
-
|
1210
|
-
|
1211
|
-
|
1212
|
-
|
1362
|
+
Analyzer *a = (Analyzer *)ecalloc(sizeof(PerFieldAnalyzer));
|
1363
|
+
|
1364
|
+
PFA(a)->default_a = default_a;
|
1365
|
+
PFA(a)->dict = h_new_str(&free, &pfa_sub_a_destroy_i);
|
1366
|
+
|
1367
|
+
a->destroy_i = &pfa_destroy_i;
|
1368
|
+
a->get_ts = pfa_get_ts;
|
1369
|
+
|
1370
|
+
return a;
|
1213
1371
|
}
|
1214
1372
|
|
1215
1373
|
#ifdef ALONE
|
1216
1374
|
int main(int argc, char **argv)
|
1217
1375
|
{
|
1218
|
-
|
1219
|
-
|
1220
|
-
|
1221
|
-
|
1222
|
-
|
1223
|
-
|
1224
|
-
|
1225
|
-
|
1226
|
-
|
1227
|
-
|
1228
|
-
|
1229
|
-
|
1376
|
+
char buf[10000];
|
1377
|
+
Analyzer *a = standard_analyzer_new(true);
|
1378
|
+
TokenStream *ts;
|
1379
|
+
Token *tk;
|
1380
|
+
while (fgets(buf, 9999, stdin) != NULL) {
|
1381
|
+
ts = a_get_ts(a, "hello", buf);
|
1382
|
+
while ((tk = ts->next(ts)) != NULL) {
|
1383
|
+
printf("<%s:%ld:%ld> ", tk->text, tk->start, tk->end);
|
1384
|
+
}
|
1385
|
+
printf("\n");
|
1386
|
+
ts_deref(ts);
|
1387
|
+
}
|
1388
|
+
return 0;
|
1230
1389
|
}
|
1231
1390
|
#endif
|