ferret 0.3.2 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +9 -0
 - data/Rakefile +51 -25
 - data/ext/analysis.c +553 -0
 - data/ext/analysis.h +76 -0
 - data/ext/array.c +83 -0
 - data/ext/array.h +19 -0
 - data/ext/bitvector.c +164 -0
 - data/ext/bitvector.h +29 -0
 - data/ext/compound_io.c +335 -0
 - data/ext/document.c +336 -0
 - data/ext/document.h +87 -0
 - data/ext/ferret.c +88 -47
 - data/ext/ferret.h +43 -109
 - data/ext/field.c +395 -0
 - data/ext/filter.c +103 -0
 - data/ext/fs_store.c +352 -0
 - data/ext/global.c +219 -0
 - data/ext/global.h +73 -0
 - data/ext/hash.c +446 -0
 - data/ext/hash.h +80 -0
 - data/ext/hashset.c +141 -0
 - data/ext/hashset.h +37 -0
 - data/ext/helper.c +11 -0
 - data/ext/helper.h +5 -0
 - data/ext/inc/lang.h +41 -0
 - data/ext/ind.c +389 -0
 - data/ext/index.h +884 -0
 - data/ext/index_io.c +269 -415
 - data/ext/index_rw.c +2543 -0
 - data/ext/lang.c +31 -0
 - data/ext/lang.h +41 -0
 - data/ext/priorityqueue.c +228 -0
 - data/ext/priorityqueue.h +44 -0
 - data/ext/q_boolean.c +1331 -0
 - data/ext/q_const_score.c +154 -0
 - data/ext/q_fuzzy.c +287 -0
 - data/ext/q_match_all.c +142 -0
 - data/ext/q_multi_phrase.c +343 -0
 - data/ext/q_parser.c +2180 -0
 - data/ext/q_phrase.c +657 -0
 - data/ext/q_prefix.c +75 -0
 - data/ext/q_range.c +247 -0
 - data/ext/q_span.c +1566 -0
 - data/ext/q_term.c +308 -0
 - data/ext/q_wildcard.c +146 -0
 - data/ext/r_analysis.c +255 -0
 - data/ext/r_doc.c +578 -0
 - data/ext/r_index_io.c +996 -0
 - data/ext/r_qparser.c +158 -0
 - data/ext/r_search.c +2321 -0
 - data/ext/r_store.c +263 -0
 - data/ext/r_term.c +219 -0
 - data/ext/ram_store.c +447 -0
 - data/ext/search.c +524 -0
 - data/ext/search.h +1065 -0
 - data/ext/similarity.c +143 -39
 - data/ext/sort.c +661 -0
 - data/ext/store.c +35 -0
 - data/ext/store.h +152 -0
 - data/ext/term.c +704 -143
 - data/ext/termdocs.c +599 -0
 - data/ext/vector.c +594 -0
 - data/lib/ferret.rb +9 -10
 - data/lib/ferret/analysis/analyzers.rb +2 -2
 - data/lib/ferret/analysis/standard_tokenizer.rb +1 -1
 - data/lib/ferret/analysis/token.rb +14 -14
 - data/lib/ferret/analysis/token_filters.rb +3 -3
 - data/lib/ferret/document/field.rb +16 -17
 - data/lib/ferret/index/document_writer.rb +4 -4
 - data/lib/ferret/index/index.rb +39 -23
 - data/lib/ferret/index/index_writer.rb +2 -2
 - data/lib/ferret/index/multiple_term_doc_pos_enum.rb +1 -8
 - data/lib/ferret/index/segment_term_vector.rb +4 -4
 - data/lib/ferret/index/term.rb +5 -1
 - data/lib/ferret/index/term_vector_offset_info.rb +6 -6
 - data/lib/ferret/index/term_vectors_io.rb +5 -5
 - data/lib/ferret/query_parser/query_parser.tab.rb +81 -77
 - data/lib/ferret/search.rb +1 -1
 - data/lib/ferret/search/boolean_query.rb +2 -1
 - data/lib/ferret/search/field_sorted_hit_queue.rb +3 -3
 - data/lib/ferret/search/fuzzy_query.rb +2 -1
 - data/lib/ferret/search/index_searcher.rb +3 -0
 - data/lib/ferret/search/{match_all_docs_query.rb → match_all_query.rb} +7 -7
 - data/lib/ferret/search/multi_phrase_query.rb +6 -5
 - data/lib/ferret/search/phrase_query.rb +3 -6
 - data/lib/ferret/search/prefix_query.rb +4 -4
 - data/lib/ferret/search/sort.rb +3 -1
 - data/lib/ferret/search/sort_field.rb +9 -9
 - data/lib/ferret/search/spans/near_spans_enum.rb +1 -1
 - data/lib/ferret/search/spans/span_near_query.rb +1 -1
 - data/lib/ferret/search/spans/span_weight.rb +1 -1
 - data/lib/ferret/search/spans/spans_enum.rb +7 -7
 - data/lib/ferret/store/fs_store.rb +10 -6
 - data/lib/ferret/store/ram_store.rb +3 -3
 - data/lib/rferret.rb +36 -0
 - data/test/functional/thread_safety_index_test.rb +2 -2
 - data/test/test_helper.rb +16 -2
 - data/test/unit/analysis/c_token.rb +25 -0
 - data/test/unit/analysis/tc_per_field_analyzer_wrapper.rb +1 -1
 - data/test/unit/analysis/tc_standard_analyzer.rb +1 -1
 - data/test/unit/document/{tc_document.rb → c_document.rb} +0 -0
 - data/test/unit/document/c_field.rb +98 -0
 - data/test/unit/document/tc_field.rb +0 -66
 - data/test/unit/index/{tc_index.rb → c_index.rb} +62 -6
 - data/test/unit/index/{tc_index_reader.rb → c_index_reader.rb} +51 -10
 - data/test/unit/index/{tc_index_writer.rb → c_index_writer.rb} +0 -4
 - data/test/unit/index/{tc_term.rb → c_term.rb} +1 -3
 - data/test/unit/index/{tc_term_vector_offset_info.rb → c_term_voi.rb} +5 -5
 - data/test/unit/index/tc_segment_term_vector.rb +2 -2
 - data/test/unit/index/tc_term_vectors_io.rb +4 -4
 - data/test/unit/query_parser/c_query_parser.rb +138 -0
 - data/test/unit/search/{tc_filter.rb → c_filter.rb} +24 -24
 - data/test/unit/search/{tc_fuzzy_query.rb → c_fuzzy_query.rb} +0 -0
 - data/test/unit/search/{tc_index_searcher.rb → c_index_searcher.rb} +9 -26
 - data/test/unit/search/{tc_search_and_sort.rb → c_search_and_sort.rb} +15 -15
 - data/test/unit/search/{tc_sort.rb → c_sort.rb} +2 -1
 - data/test/unit/search/c_sort_field.rb +27 -0
 - data/test/unit/search/{tc_spans.rb → c_spans.rb} +0 -0
 - data/test/unit/search/tc_sort_field.rb +7 -20
 - data/test/unit/store/c_fs_store.rb +76 -0
 - data/test/unit/store/c_ram_store.rb +35 -0
 - data/test/unit/store/m_store.rb +34 -0
 - data/test/unit/store/m_store_lock.rb +68 -0
 - data/test/unit/store/tc_fs_store.rb +0 -53
 - data/test/unit/store/tc_ram_store.rb +0 -20
 - data/test/unit/store/tm_store.rb +0 -30
 - data/test/unit/store/tm_store_lock.rb +0 -66
 - metadata +84 -31
 - data/ext/Makefile +0 -140
 - data/ext/ferret_ext.so +0 -0
 - data/ext/priority_queue.c +0 -232
 - data/ext/ram_directory.c +0 -321
 - data/ext/segment_merge_queue.c +0 -37
 - data/ext/segment_term_enum.c +0 -326
 - data/ext/string_helper.c +0 -42
 - data/ext/tags +0 -344
 - data/ext/term_buffer.c +0 -230
 - data/ext/term_infos_reader.c +0 -54
 - data/ext/terminfo.c +0 -160
 - data/ext/token.c +0 -93
 - data/ext/util.c +0 -12
 
    
        data/CHANGELOG
    ADDED
    
    | 
         @@ -0,0 +1,9 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            20060316:
         
     | 
| 
      
 2 
     | 
    
         
            +
              * changed Token#term_text to Token#text
         
     | 
| 
      
 3 
     | 
    
         
            +
              * changed Token#position_increment to Token#pos_inc
         
     | 
| 
      
 4 
     | 
    
         
            +
              * changed order of args to Token.new. Now Token.new(text, start_offset,
         
     | 
| 
      
 5 
     | 
    
         
            +
                end_offset, pos_inc=1, type="text"). NOTE: type does nothing.
         
     | 
| 
      
 6 
     | 
    
         
            +
              * changed TermVectorOffsetInfo#start_offset to TermVectorOffsetInfo#start
         
     | 
| 
      
 7 
     | 
    
         
            +
              * changed TermVectorOffsetInfo#end_offset to TermVectorOffsetInfo#end
         
     | 
| 
      
 8 
     | 
    
         
            +
              * added :id_field option to Index::Index class.
         
     | 
| 
      
 9 
     | 
    
         
            +
             
     | 
    
        data/Rakefile
    CHANGED
    
    | 
         @@ -9,7 +9,7 @@ require 'rake/testtask' 
     | 
|
| 
       9 
9 
     | 
    
         
             
            require 'rake/rdoctask'
         
     | 
| 
       10 
10 
     | 
    
         
             
            require 'rake/clean'
         
     | 
| 
       11 
11 
     | 
    
         
             
            require 'rake_utils/code_statistics'
         
     | 
| 
       12 
     | 
    
         
            -
            require 'lib/ 
     | 
| 
      
 12 
     | 
    
         
            +
            require 'lib/rferret'
         
     | 
| 
       13 
13 
     | 
    
         | 
| 
       14 
14 
     | 
    
         
             
            begin
         
     | 
| 
       15 
15 
     | 
    
         
             
              require 'rubygems'
         
     | 
| 
         @@ -30,18 +30,32 @@ def announce(msg='') 
     | 
|
| 
       30 
30 
     | 
    
         
             
            end
         
     | 
| 
       31 
31 
     | 
    
         | 
| 
       32 
32 
     | 
    
         
             
            $VERBOSE = nil
         
     | 
| 
      
 33 
     | 
    
         
            +
             
     | 
| 
      
 34 
     | 
    
         
            +
            EXT = "ferret_ext.so"
         
     | 
| 
      
 35 
     | 
    
         
            +
            EXT_SRC = FileList["src/*/*.[ch]"]
         
     | 
| 
      
 36 
     | 
    
         
            +
            EXT_SRC_DEST = EXT_SRC.map {|fn| File.join("ext", File.basename(fn))}
         
     | 
| 
      
 37 
     | 
    
         
            +
            SRC = (FileList["ext/*.[ch]"] + EXT_SRC_DEST).uniq
         
     | 
| 
      
 38 
     | 
    
         
            +
             
     | 
| 
       33 
39 
     | 
    
         
             
            CLEAN.include(FileList['**/*.o', 'InstalledFiles', '.config'])
         
     | 
| 
       34 
40 
     | 
    
         
             
            CLOBBER.include(FileList['**/*.so'], 'ext/Makefile')
         
     | 
| 
       35 
41 
     | 
    
         | 
| 
       36 
42 
     | 
    
         
             
            task :default => :all_tests
         
     | 
| 
       37 
43 
     | 
    
         
             
            desc "Run all tests"
         
     | 
| 
       38 
     | 
    
         
            -
            task :all_tests => [ : 
     | 
| 
      
 44 
     | 
    
         
            +
            task :all_tests => [ :test_runits, :test_cunits, :test_functional ]
         
     | 
| 
       39 
45 
     | 
    
         | 
| 
       40 
46 
     | 
    
         
             
            desc "Generate API documentation, and show coding stats"
         
     | 
| 
       41 
47 
     | 
    
         
             
            task :doc => [ :stats, :appdoc ]
         
     | 
| 
       42 
48 
     | 
    
         | 
| 
       43 
     | 
    
         
            -
            desc "run unit tests in test/unit"
         
     | 
| 
       44 
     | 
    
         
            -
            Rake::TestTask.new(" 
     | 
| 
      
 49 
     | 
    
         
            +
            desc "run unit tests in test/unit for pure ruby ferret"
         
     | 
| 
      
 50 
     | 
    
         
            +
            Rake::TestTask.new("test_runits" => :parsers) do |t|
         
     | 
| 
      
 51 
     | 
    
         
            +
              t.ruby_opts = ["-r 'lib/rferret'"]
         
     | 
| 
      
 52 
     | 
    
         
            +
              t.libs << "test/unit"
         
     | 
| 
      
 53 
     | 
    
         
            +
              t.pattern = 'test/unit/ts_*.rb'
         
     | 
| 
      
 54 
     | 
    
         
            +
              t.verbose = true
         
     | 
| 
      
 55 
     | 
    
         
            +
            end
         
     | 
| 
      
 56 
     | 
    
         
            +
             
     | 
| 
      
 57 
     | 
    
         
            +
            desc "run unit tests in test/unit for C ferret"
         
     | 
| 
      
 58 
     | 
    
         
            +
            Rake::TestTask.new("test_cunits" => :ext) do |t|
         
     | 
| 
       45 
59 
     | 
    
         
             
              t.libs << "test/unit"
         
     | 
| 
       46 
60 
     | 
    
         
             
              t.pattern = 'test/unit/t[cs]_*.rb'
         
     | 
| 
       47 
61 
     | 
    
         
             
              t.verbose = true
         
     | 
| 
         @@ -84,22 +98,28 @@ rd = Rake::RDocTask.new("appdoc") do |rdoc| 
     | 
|
| 
       84 
98 
     | 
    
         
             
              rdoc.rdoc_files.include('lib/**/*.rb')
         
     | 
| 
       85 
99 
     | 
    
         
             
            end
         
     | 
| 
       86 
100 
     | 
    
         | 
| 
       87 
     | 
    
         
            -
             
     | 
| 
      
 101 
     | 
    
         
            +
            EXT_SRC.each do |fn|
         
     | 
| 
      
 102 
     | 
    
         
            +
              dest_fn = File.join("ext", File.basename(fn))
         
     | 
| 
      
 103 
     | 
    
         
            +
              file dest_fn => fn do |t|
         
     | 
| 
      
 104 
     | 
    
         
            +
                cp fn, dest_fn
         
     | 
| 
      
 105 
     | 
    
         
            +
              end
         
     | 
| 
      
 106 
     | 
    
         
            +
            end
         
     | 
| 
       88 
107 
     | 
    
         | 
| 
       89 
108 
     | 
    
         
             
            desc "Build the extension"
         
     | 
| 
       90 
     | 
    
         
            -
            task :ext => "ext/#{EXT}"
         
     | 
| 
      
 109 
     | 
    
         
            +
            task :ext => ["ext/#{EXT}"] + SRC
         
     | 
| 
       91 
110 
     | 
    
         | 
| 
       92 
     | 
    
         
            -
            file "ext/#{EXT}" => "ext/Makefile" do
         
     | 
| 
      
 111 
     | 
    
         
            +
            file "ext/#{EXT}" => ["ext/Makefile"] do
         
     | 
| 
      
 112 
     | 
    
         
            +
              cp "ext/inc/lang.h", "ext/lang.h"
         
     | 
| 
       93 
113 
     | 
    
         
             
              sh "cd ext; make"
         
     | 
| 
       94 
114 
     | 
    
         
             
            end
         
     | 
| 
       95 
115 
     | 
    
         | 
| 
       96 
     | 
    
         
            -
            file "ext/Makefile" do
         
     | 
| 
      
 116 
     | 
    
         
            +
            file "ext/Makefile" => SRC do
         
     | 
| 
       97 
117 
     | 
    
         
             
              sh "cd ext; ruby extconf.rb"
         
     | 
| 
       98 
118 
     | 
    
         
             
            end
         
     | 
| 
       99 
119 
     | 
    
         | 
| 
       100 
120 
     | 
    
         
             
            # Make Parsers ---------------------------------------------------------------
         
     | 
| 
       101 
121 
     | 
    
         | 
| 
       102 
     | 
    
         
            -
            RACC_SRC = FileList[" 
     | 
| 
      
 122 
     | 
    
         
            +
            RACC_SRC = FileList["lib/**/*.y"]
         
     | 
| 
       103 
123 
     | 
    
         
             
            RACC_OUT = RACC_SRC.collect { |fn| fn.sub(/\.y$/, '.tab.rb') }
         
     | 
| 
       104 
124 
     | 
    
         | 
| 
       105 
125 
     | 
    
         
             
            task :parsers => RACC_OUT
         
     | 
| 
         @@ -195,8 +215,9 @@ end 
     | 
|
| 
       195 
215 
     | 
    
         
             
            # Creating a release
         
     | 
| 
       196 
216 
     | 
    
         | 
| 
       197 
217 
     | 
    
         
             
            desc "Make a new release"
         
     | 
| 
       198 
     | 
    
         
            -
            task :prerelease => [: 
     | 
| 
       199 
     | 
    
         
            -
            task : 
     | 
| 
      
 218 
     | 
    
         
            +
            task :prerelease => [:all_tests, :clobber]
         
     | 
| 
      
 219 
     | 
    
         
            +
            task :repackage => EXT_SRC_DEST
         
     | 
| 
      
 220 
     | 
    
         
            +
            task :package => EXT_SRC_DEST
         
     | 
| 
       200 
221 
     | 
    
         
             
            task :tag => [:prerelease]
         
     | 
| 
       201 
222 
     | 
    
         
             
            task :update_version => [:prerelease]
         
     | 
| 
       202 
223 
     | 
    
         
             
            task :release => [:tag, :update_version, :package] do
         
     | 
| 
         @@ -229,7 +250,7 @@ task :prerelease do 
     | 
|
| 
       229 
250 
     | 
    
         
             
              end
         
     | 
| 
       230 
251 
     | 
    
         | 
| 
       231 
252 
     | 
    
         
             
              # Are all source files checked in?
         
     | 
| 
       232 
     | 
    
         
            -
              data = `svn -q status`
         
     | 
| 
      
 253 
     | 
    
         
            +
              data = `svn -q --ignore-externals status`
         
     | 
| 
       233 
254 
     | 
    
         
             
              unless data =~ /^$/
         
     | 
| 
       234 
255 
     | 
    
         
             
                fail "'svn -q status' is not clean ... do you have unchecked-in files?"
         
     | 
| 
       235 
256 
     | 
    
         
             
              end
         
     | 
| 
         @@ -237,28 +258,33 @@ task :prerelease do 
     | 
|
| 
       237 
258 
     | 
    
         
             
              announce "No outstanding checkins found ... OK"
         
     | 
| 
       238 
259 
     | 
    
         
             
            end
         
     | 
| 
       239 
260 
     | 
    
         | 
| 
      
 261 
     | 
    
         
            +
            # Copy the file +fn+ to "<fn>.new", replacing any line that starts with
            # two spaces followed by "VERSION =" with a VERSION assignment set to
            # PKG_VERSION. Every other line is written through unchanged. The
            # original file itself is not modified here.
            def reversion(fn)
              open(fn) do |src|
                open(fn + ".new", "w") do |dst|
                  src.each do |line|
                    is_version_line = line =~ /^  VERSION\s*=\s*/
                    dst.puts(is_version_line ? "  VERSION = '#{PKG_VERSION}'" : line)
                  end
                end
              end
            end
         
     | 
| 
      
 274 
     | 
    
         
            +
             
     | 
| 
       240 
275 
     | 
    
         
             
            task :update_version => [:prerelease] do
         
     | 
| 
       241 
276 
     | 
    
         
             
              if PKG_VERSION == CURRENT_VERSION
         
     | 
| 
       242 
277 
     | 
    
         
             
                announce "No version change ... skipping version update"
         
     | 
| 
       243 
278 
     | 
    
         
             
              else
         
     | 
| 
       244 
279 
     | 
    
         
             
                announce "Updating Ferret version to #{PKG_VERSION}"
         
     | 
| 
       245 
     | 
    
         
            -
                 
     | 
| 
       246 
     | 
    
         
            -
             
     | 
| 
       247 
     | 
    
         
            -
                    ferret_in.each do |line|
         
     | 
| 
       248 
     | 
    
         
            -
                      if line =~ /^  VERSION\s*=\s*/
         
     | 
| 
       249 
     | 
    
         
            -
                        ferret_out.puts "  VERSION = '#{PKG_VERSION}'"
         
     | 
| 
       250 
     | 
    
         
            -
                      else
         
     | 
| 
       251 
     | 
    
         
            -
                        ferret_out.puts line
         
     | 
| 
       252 
     | 
    
         
            -
                      end
         
     | 
| 
       253 
     | 
    
         
            -
                    end
         
     | 
| 
       254 
     | 
    
         
            -
                  end
         
     | 
| 
       255 
     | 
    
         
            -
                end
         
     | 
| 
      
 280 
     | 
    
         
            +
                reversion("lib/ferret.rb")
         
     | 
| 
      
 281 
     | 
    
         
            +
                reversion("lib/rferret.rb")
         
     | 
| 
       256 
282 
     | 
    
         
             
                if ENV['RELTEST']
         
     | 
| 
       257 
283 
     | 
    
         
             
                  announce "Release Task Testing, skipping commiting of new version"
         
     | 
| 
       258 
284 
     | 
    
         
             
                else
         
     | 
| 
       259 
     | 
    
         
            -
                  mv "lib/ 
     | 
| 
      
 285 
     | 
    
         
            +
                  mv "lib/rferret.rb.new", "lib/rferret.rb"
         
     | 
| 
       260 
286 
     | 
    
         
             
                end
         
     | 
| 
       261 
     | 
    
         
            -
                sh %{svn ci -m "Updated to version #{PKG_VERSION}" lib/ 
     | 
| 
      
 287 
     | 
    
         
            +
                sh %{svn ci -m "Updated to version #{PKG_VERSION}" lib/rferret.rb}
         
     | 
| 
       262 
288 
     | 
    
         
             
              end
         
     | 
| 
       263 
289 
     | 
    
         
             
            end
         
     | 
| 
       264 
290 
     | 
    
         | 
    
        data/ext/analysis.c
    ADDED
    
    | 
         @@ -0,0 +1,553 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            #include <analysis.h>
         
     | 
| 
      
 2 
     | 
    
         
            +
            #include <string.h>
         
     | 
| 
      
 3 
     | 
    
         
            +
            #include <ctype.h>
         
     | 
| 
      
 4 
     | 
    
         
            +
            #include <hash.h>
         
     | 
| 
      
 5 
     | 
    
         
            +
             
     | 
| 
      
 6 
     | 
    
         
            +
            Token *tk_create()
         
     | 
| 
      
 7 
     | 
    
         
            +
            {
         
     | 
| 
      
 8 
     | 
    
         
            +
              return ALLOC(Token);
         
     | 
| 
      
 9 
     | 
    
         
            +
            }
         
     | 
| 
      
 10 
     | 
    
         
            +
             
     | 
| 
      
 11 
     | 
    
         
            +
            /*
             * Release a token. The pointer is handed straight to free(), so passing
             * NULL is harmless.
             */
            void tk_destroy(void *p)
            {
              void *token = p;
              free(token);
            }
         
     | 
| 
      
 15 
     | 
    
         
            +
             
     | 
| 
      
 16 
     | 
    
         
            +
            /*
             * Overwrite the contents of an existing token.
             *
             *   tk      - token to fill in
             *   text    - first byte of the term; only tlen bytes are read, so it
             *             does not have to be NUL-terminated
             *   tlen    - number of bytes to copy from text
             *   start   - stored in tk->start
             *   end     - stored in tk->end
             *   pos_inc - stored in tk->pos_inc
             *
             * The copied length is clamped to the range [0, MAX_WORD_SIZE - 1] so
             * the terminating NUL is always written inside tk->text (assumes
             * tk->text holds MAX_WORD_SIZE bytes -- confirm against analysis.h).
             * A negative tlen previously reached memcpy as a huge size_t and
             * indexed tk->text below its start; it now yields an empty term.
             *
             * Returns tk.
             */
            inline Token *tk_set(Token *tk, char *text, int tlen, int start, int end, int pos_inc)
            {
              if (tlen < 0) {
                tlen = 0;
              } else if (tlen >= MAX_WORD_SIZE) {
                tlen = MAX_WORD_SIZE - 1;
              }
              memcpy(tk->text, text, (size_t)tlen);
              tk->text[tlen] = '\0';
              tk->start = start;
              tk->end = end;
              tk->pos_inc = pos_inc;
              return tk;
            }
         
     | 
| 
      
 26 
     | 
    
         
            +
             
     | 
| 
      
 27 
     | 
    
         
            +
            /*
             * Same as tk_set(), but for a NUL-terminated text: the length is
             * measured with strlen() and forwarded. Returns tk.
             */
            inline Token *tk_set_no_len(Token *tk, char *text, int start, int end, int pos_inc)
            {
              int tlen = strlen(text);
              return tk_set(tk, text, tlen, start, end, pos_inc);
            }
         
     | 
| 
      
 31 
     | 
    
         
            +
             
     | 
| 
      
 32 
     | 
    
         
            +
            /*
             * Token equality: true when both tokens have the same start, the same
             * end and identical text. pos_inc is not compared.
             */
            int tk_eq(Token *tk1, Token *tk2)
            {
              if (tk1->start != tk2->start)
                return false;
              if (tk1->end != tk2->end)
                return false;
              if (strcmp((char *)tk1->text, (char *)tk2->text) != 0)
                return false;
              return true;
            }
         
     | 
| 
      
 40 
     | 
    
         
            +
             
     | 
| 
      
 41 
     | 
    
         
            +
            /*
             * Three-way comparison of two tokens.
             *
             * Tokens are ordered by start first, then by end; when both match the
             * result of strcmp() on the texts is returned. The start/end branches
             * return exactly 1 or -1.
             */
            int tk_cmp(Token *tk1, Token *tk2)
            {
              if (tk1->start > tk2->start)
                return 1;
              if (tk1->start < tk2->start)
                return -1;

              if (tk1->end > tk2->end)
                return 1;
              if (tk1->end < tk2->end)
                return -1;

              return strcmp((char *)tk1->text, (char *)tk2->text);
            }
         
     | 
| 
      
 59 
     | 
    
         
            +
             
     | 
| 
      
 60 
     | 
    
         
            +
            void ts_standard_destroy(void *p)
         
     | 
| 
      
 61 
     | 
    
         
            +
            {
         
     | 
| 
      
 62 
     | 
    
         
            +
              TokenStream *ts = (TokenStream *)p;
         
     | 
| 
      
 63 
     | 
    
         
            +
              tk_destroy(ts->token);
         
     | 
| 
      
 64 
     | 
    
         
            +
              free(p);
         
     | 
| 
      
 65 
     | 
    
         
            +
            }
         
     | 
| 
      
 66 
     | 
    
         
            +
             
     | 
| 
      
 67 
     | 
    
         
            +
            /*
             * Point the stream at a new text and rewind the read position to 0.
             * Only the pointer is stored; the text is not copied.
             */
            void ts_reset(TokenStream *ts, char *text)
            {
              ts->pos = 0;
              ts->text = text;
            }
         
     | 
| 
      
 72 
     | 
    
         
            +
             
     | 
| 
      
 73 
     | 
    
         
            +
            TokenStream *ts_create()
         
     | 
| 
      
 74 
     | 
    
         
            +
            {
         
     | 
| 
      
 75 
     | 
    
         
            +
              TokenStream *ts = ALLOC(TokenStream);
         
     | 
| 
      
 76 
     | 
    
         
            +
              ts->pos = -1;
         
     | 
| 
      
 77 
     | 
    
         
            +
              ts->text = NULL;
         
     | 
| 
      
 78 
     | 
    
         
            +
              ts->token = tk_create();
         
     | 
| 
      
 79 
     | 
    
         
            +
              ts->destroy = &ts_standard_destroy;
         
     | 
| 
      
 80 
     | 
    
         
            +
              ts->reset = &ts_reset;
         
     | 
| 
      
 81 
     | 
    
         
            +
              return ts;
         
     | 
| 
      
 82 
     | 
    
         
            +
            }
         
     | 
| 
      
 83 
     | 
    
         
            +
             
     | 
| 
      
 84 
     | 
    
         
            +
            /*
             * Return the next whitespace-delimited token from the stream, or NULL
             * when no further token is available.
             *
             * Scanning starts at ts->pos, skips leading whitespace and then takes
             * every byte up to the next whitespace or the terminating NUL. On
             * success ts->pos is advanced to the end of the token; when NULL is
             * returned ts->pos is left unchanged.
             *
             * The returned pointer is ts->token, which is overwritten on every
             * call, so callers must copy anything they want to keep.
             *
             * A stream that has no text yet (text == NULL, as left by ts_create())
             * yields NULL instead of dereferencing a null pointer.
             */
            Token *wst_next(TokenStream *ts)
            {
              int i = ts->pos;
              int start, end;
              char *text = ts->text;

              if (text == NULL)
                return NULL;

              /* <ctype.h> functions are only defined for unsigned char values and
               * EOF, so bytes >= 0x80 must not be passed as a (possibly negative)
               * plain char. */
              while (text[i] != '\0' && isspace((unsigned char)text[i]))
                i++;
              if (text[i] == '\0')
                return NULL;

              start = i;
              while (text[i] != '\0' && !isspace((unsigned char)text[i]))
                i++;
              ts->pos = end = i;
              tk_set(ts->token, text + start, end - start, start, end, 1);
              return ts->token;
            }
         
     | 
| 
      
 102 
     | 
    
         
            +
             
     | 
| 
      
 103 
     | 
    
         
            +
            TokenStream *whitespace_tokenizer_create()
         
     | 
| 
      
 104 
     | 
    
         
            +
            {
         
     | 
| 
      
 105 
     | 
    
         
            +
              TokenStream *ts = ts_create();
         
     | 
| 
      
 106 
     | 
    
         
            +
              ts->next = &wst_next;
         
     | 
| 
      
 107 
     | 
    
         
            +
              return ts;
         
     | 
| 
      
 108 
     | 
    
         
            +
            }
         
     | 
| 
      
 109 
     | 
    
         
            +
             
     | 
| 
      
 110 
     | 
    
         
            +
            /*
             * Letter tokenizer step.
             *
             * Starting at ts->pos, skips every character that is not alphabetic and
             * then collects the following run of alphabetic characters. The run is
             * stored in ts->token through tk_set (text pointer, length, start offset,
             * end offset, and a final argument of 1 -- presumably the position
             * increment; confirm against tk_set) and ts->pos is moved to the end of the
             * run.
             *
             * Returns ts->token, or NULL when no alphabetic character is left. In the
             * NULL case ts->pos is not modified.
             */
            Token *lt_next(TokenStream *ts)
            {
              int i = ts->pos;
              int start, end;
              char *text = ts->text;

              /* <ctype.h> classifiers are only defined for unsigned char values (or
               * EOF); a plain char may be negative for bytes >= 0x80, hence the casts. */
              while (text[i] != '\0' && !isalpha((unsigned char)text[i]))
                i++;
              if (text[i] == '\0')
                return NULL;

              start = i;
              while (text[i] != '\0' && isalpha((unsigned char)text[i]))
                i++;
              ts->pos = end = i;
              tk_set(ts->token, text+start, end-start, start, end, 1);
              return ts->token;
            }
         
     | 
| 
      
 128 
     | 
    
         
            +
             
     | 
| 
      
 129 
     | 
    
         
            +
            TokenStream *letter_tokenizer_create()
         
     | 
| 
      
 130 
     | 
    
         
            +
            {
         
     | 
| 
      
 131 
     | 
    
         
            +
              TokenStream *ts = ts_create();
         
     | 
| 
      
 132 
     | 
    
         
            +
              ts->next = <_next;
         
     | 
| 
      
 133 
     | 
    
         
            +
              return ts;
         
     | 
| 
      
 134 
     | 
    
         
            +
            }
         
     | 
| 
      
 135 
     | 
    
         
            +
             
     | 
| 
      
 136 
     | 
    
         
            +
            void a_standard_destroy(void *p)
         
     | 
| 
      
 137 
     | 
    
         
            +
            {
         
     | 
| 
      
 138 
     | 
    
         
            +
              Analyzer *a = (Analyzer *)p;
         
     | 
| 
      
 139 
     | 
    
         
            +
              ts_destroy(a->current_ts);
         
     | 
| 
      
 140 
     | 
    
         
            +
              free(p);
         
     | 
| 
      
 141 
     | 
    
         
            +
            }
         
     | 
| 
      
 142 
     | 
    
         
            +
             
     | 
| 
      
 143 
     | 
    
         
            +
            /*
             * Resets the analyzer's current token stream onto text (through the
             * stream's own reset callback) and returns that stream. The field argument
             * is not used here.
             */
            TokenStream *a_standard_get_ts(Analyzer *a, char *field, char *text)
            {
              TokenStream *stream = a->current_ts;

              (void)field;
              stream->reset(stream, text);
              return stream;
            }
         
     | 
| 
      
 148 
     | 
    
         
            +
             
     | 
| 
      
 149 
     | 
    
         
            +
            Analyzer *whitespace_analyzer_create()
         
     | 
| 
      
 150 
     | 
    
         
            +
            {
         
     | 
| 
      
 151 
     | 
    
         
            +
              Analyzer *a = ALLOC(Analyzer);
         
     | 
| 
      
 152 
     | 
    
         
            +
              a->data = NULL;
         
     | 
| 
      
 153 
     | 
    
         
            +
              a->current_ts = whitespace_tokenizer_create();
         
     | 
| 
      
 154 
     | 
    
         
            +
              a->destroy = &a_standard_destroy;
         
     | 
| 
      
 155 
     | 
    
         
            +
              a->get_ts = &a_standard_get_ts;
         
     | 
| 
      
 156 
     | 
    
         
            +
              return a;
         
     | 
| 
      
 157 
     | 
    
         
            +
            }
         
     | 
| 
      
 158 
     | 
    
         
            +
             
     | 
| 
      
 159 
     | 
    
         
            +
            /*
             * Copies the leading run of alphabetic characters from input into token and
             * returns its length. token is not NUL-terminated and no size limit is
             * applied, so the caller must supply a buffer large enough for the run.
             */
            int std_get_alpha(char *input, char *token)
            {
              int i = 0;
              /* cast: <ctype.h> functions are undefined for negative char values */
              while (input[i] != '\0' && isalpha((unsigned char)input[i])) {
                token[i] = input[i];
                i++;
              }
              return i;
            }
         
     | 
| 
      
 168 
     | 
    
         
            +
             
     | 
| 
      
 169 
     | 
    
         
            +
            /*
             * Copies the leading run of alphanumeric characters from input into token
             * and returns its length. token is not NUL-terminated and no size limit is
             * applied, so the caller must supply a buffer large enough for the run.
             */
            int std_get_alnum(char *input, char *token)
            {
              int i = 0;
              /* cast: <ctype.h> functions are undefined for negative char values */
              while (input[i] != '\0' && isalnum((unsigned char)input[i])) {
                token[i] = input[i];
                i++;
              }
              return i;
            }
         
     | 
| 
      
 178 
     | 
    
         
            +
             
     | 
| 
      
 179 
     | 
    
         
            +
            /*
             * Returns 1 if c is one of the punctuation characters allowed inside a
             * number token ('.', ',', '\', '/', '_', '-'), otherwise 0.
             */
            int isnumpunc(char c)
            {
              switch (c) {
                case '.':
                case ',':
                case '\\':
                case '/':
                case '_':
                case '-':
                  return 1;
                default:
                  return 0;
              }
            }
         
     | 
| 
      
 183 
     | 
    
         
            +
             
     | 
| 
      
 184 
     | 
    
         
            +
            /*
             * Returns 1 if c is one of the punctuation characters allowed inside a URL
             * ('.', '/', '-', '_'), otherwise 0.
             */
            int isurlpunc(char c)
            {
              if (c == '.') return 1;
              if (c == '/') return 1;
              if (c == '-') return 1;
              if (c == '_') return 1;
              return 0;
            }
         
     | 
| 
      
 188 
     | 
    
         
            +
             
     | 
| 
      
 189 
     | 
    
         
            +
            /*
             * Returns 1 if c may appear in a URL: URL punctuation ('.', '/', '-', '_')
             * or an alphanumeric character. Returns 0 otherwise.
             */
            int isurlc(char c)
            {
              /* cast: <ctype.h> functions are undefined for negative char values */
              return (c == '.' || c == '/' || c == '-' || c == '_' ||
                      isalnum((unsigned char)c));
            }
         
     | 
| 
      
 193 
     | 
    
         
            +
             
     | 
| 
      
 194 
     | 
    
         
            +
            /*
             * Returns 1 if c is URL punctuation ('.', '/', '-', '_') or the '@' sign,
             * otherwise 0.
             */
            int isurlxatpunc(char c)
            {
              switch (c) {
                case '.':
                case '/':
                case '-':
                case '_':
                case '@':
                  return 1;
                default:
                  return 0;
              }
            }
         
     | 
| 
      
 198 
     | 
    
         
            +
             
     | 
| 
      
 199 
     | 
    
         
            +
            /*
             * Returns 1 if c may appear in a URL or e-mail style token: '.', '/', '-',
             * '_', '@' or an alphanumeric character. Returns 0 otherwise.
             */
            int isurlxatc(char c)
            {
              /* cast: <ctype.h> functions are undefined for negative char values */
              return (c == '.' || c == '/' || c == '-' || c == '_' || c == '@' ||
                      isalnum((unsigned char)c));
            }
         
     | 
| 
      
 203 
     | 
    
         
            +
             
     | 
| 
      
 204 
     | 
    
         
            +
            /*
             * Returns true if c can be part of a standard-tokenizer token: an
             * alphanumeric character, number punctuation (see isnumpunc), '&', '@',
             * an apostrophe or ':'. Whitespace and everything else yield false.
             */
            int isstdtokchar(char c)
            {
              /* <ctype.h> functions are undefined for negative char values */
              unsigned char uc = (unsigned char)c;

              if (isspace(uc)) return false; // most common so check first.
              if (isalnum(uc) || isnumpunc(c) || c == '&' ||
                  c == '@' || c == '\'' || c == ':')
                return true;
              return false;
            }
         
     | 
| 
      
 212 
     | 
    
         
            +
             
     | 
| 
      
 213 
     | 
    
         
            +
            /* (alnum)((punc)(alnum))+ where every second sequence of alnum must contain at
             * least one digit.
             * (alnum) = [a-zA-Z0-9]
             * (punc) = [_\/.,-]
             *
             * Scans the NUL-terminated input from its start and returns the length of
             * the number-like prefix found, or 0 if no digit was seen at all.
             */
            int std_get_number(char *input)
            {
              int i = 0;
              int count = 0;
              /* Decremented after every alnum run and reset to 2 when a run scanned
               * after a decrement contains a digit; the scan stops once it is negative. */
              int last_seen_digit = 2;
              int seen_digit = false;

              while (last_seen_digit >= 0) {
                /* casts: <ctype.h> functions are undefined for negative char values */
                while ((input[i] != '\0') && isalnum((unsigned char)input[i])) {
                  if ((last_seen_digit < 2) && isdigit((unsigned char)input[i])) last_seen_digit = 2;
                  if ((seen_digit == false) && isdigit((unsigned char)input[i])) seen_digit = true;
                  i++;
                }
                last_seen_digit--;
                /* Stop unless the run is followed by punctuation and another alnum.
                 * input[i+1] is only read when input[i] is punctuation, i.e. not NUL. */
                if (!isnumpunc(input[i]) || !isalnum((unsigned char)input[i+1])) {

                  if (last_seen_digit >= 0)
                    count = i;
                  break;
                }
                count = i;
                i++; /* step over the punctuation character */
              }
              if (seen_digit)
                return count;
              else
                return 0;
            }
         
     | 
| 
      
 246 
     | 
    
         
            +
             
     | 
| 
      
 247 
     | 
    
         
            +
            /*
             * Returns the length of the leading run of input made up only of alphabetic
             * characters and apostrophes. input must be NUL-terminated.
             */
            int std_get_apostrophe(char *input)
            {
              int i = 0;

              /* cast: <ctype.h> functions are undefined for negative char values */
              while (isalpha((unsigned char)input[i]) || input[i] == '\'')
                i++;

              return i;
            }
         
     | 
| 
      
 256 
     | 
    
         
            +
             
     | 
| 
      
 257 
     | 
    
         
            +
            /*
             * Copies URL characters (see isurlc) from input into token, stopping at the
             * first non-URL character or at the second of two consecutive punctuation
             * characters, and returns the length with trailing punctuation removed.
             *
             * The look-behind reads input[len-1], including when len is 0, so input
             * must point inside a larger string with a readable character before it.
             * token is neither bounds-checked nor NUL-terminated here.
             */
            int std_get_url(char *input, char *token)
            {
              int len;

              for (len = 0; isurlc(input[len]); len++) {
                /* two punctuation characters in a row end the URL */
                if (isurlpunc(input[len]) && isurlpunc(input[len-1])) {
                  break;
                }
                token[len] = input[len];
              }

              /* drop trailing punctuation */
              while (isurlpunc(input[len-1])) {
                len--;
              }

              return len;
            }
         
     | 
| 
      
 273 
     | 
    
         
            +
             
     | 
| 
      
 274 
     | 
    
         
            +
            /* Company names can contain '@' and '&' like AT&T and Excite@Home.
             *
             * Returns the length of the leading run of input made up only of alphabetic
             * characters, '@' and '&'. input must be NUL-terminated.
             */
            int std_get_company_name(char *input)
            {
              int i = 0;
              /* cast: <ctype.h> functions are undefined for negative char values */
              while (isalpha((unsigned char)input[i]) || input[i] == '@' || input[i] == '&')
                i++;

              return i;
            }
         
     | 
| 
      
 284 
     | 
    
         
            +
             
     | 
| 
      
 285 
     | 
    
         
            +
            /*
             * Standard tokenizer step.
             *
             * Skips to the next alphanumeric character at or after ts->pos and fills
             * ts->token (through tk_set) with one of:
             *   - a number (std_get_number),
             *   - a plain word,
             *   - a word containing apostrophes, with a trailing 's / 'S removed,
             *   - a company name containing '&' / '@' (std_get_company_name),
             *   - a URL; the scheme is dropped for ftp, http, https and file,
             *   - an acronym of single letters separated by '.', with the dots removed,
             *   - otherwise the longest URL / e-mail like run, or the number found.
             * ts->pos is moved to the end of the consumed text.
             *
             * Returns ts->token, or NULL when no alphanumeric character is left (ts->pos
             * is then left unchanged).
             *
             * NOTE(review): token is a fixed MAX_WORD_SIZE buffer, but std_get_alpha,
             * std_get_url, the memcpy and the acronym loop below write to it without a
             * length check -- confirm that word length is bounded elsewhere.
             */
            Token *std_next(TokenStream *ts)
            {
              int i = ts->pos, j;
              int start;
              char *text = ts->text;
              char token[MAX_WORD_SIZE];
              int token_i = 0;
              int len;
              int num_end = 0;
              int is_acronym;
              int seen_at_symbol;

              /* casts throughout: <ctype.h> functions are undefined for negative chars */
              while (text[i] != '\0' && !isalnum((unsigned char)text[i]))
                i++;
              if (text[i] == '\0')
                return NULL;

              start = i;
              if (isdigit((unsigned char)text[i])) {
                i += std_get_number(text + i);
                ts->pos = i;
                tk_set(ts->token, text+start, i - start, start, ts->pos, 1);
              } else {
                token_i = std_get_alpha(text + i, token);
                i += token_i;

                if (!isstdtokchar(text[i])) {
                  // very common case, ie a plain word, so check and return
                  tk_set(ts->token, text+start, i-start, start, i, 1);
                  ts->pos = i;
                  return ts->token;
                }

                if (text[i] == '\'') { // apostrophe case.
                  i += std_get_apostrophe(text + i);
                  ts->pos = i;
                  len = i - start;
                  // strip possessive
                  if ((text[i-1] == 's' || text[i-1] == 'S') && text[i-2] == '\'')
                    len -= 2;
                  tk_set(ts->token, text+start, len, start, i, 1);
                  return ts->token;
                }
                if (text[i] == '&') { // company name case.
                  i += std_get_company_name(text + i);
                  ts->pos = i;
                  tk_set(ts->token, text+start, i - start, start, i, 1);
                  return ts->token;
                }

                if (isdigit((unsigned char)text[i]) || isnumpunc(text[i])) { // possibly a number
                  num_end = start + std_get_number(text + start);
                  if (!isstdtokchar(text[num_end])) { // we won't find a longer token
                    ts->pos = num_end;
                    tk_set(ts->token, text+start, num_end-start, start, ts->pos, 1);
                    return ts->token;
                  }
                  // else there may be a longer token so check
                }

                if (text[i] == ':' && text[i+1] == '/' && text[i+2] == '/') {
                  // check for a known url start
                  token[token_i] = '\0';
                  i += 3;
                  while (text[i] == '/') i++;
                  if (isalpha((unsigned char)text[i]) &&
                      (strcmp(token, "ftp") == 0 ||
                       strcmp(token, "http") == 0 ||
                       strcmp(token, "https") == 0 ||
                       strcmp(token, "file") == 0)) {
                    len = std_get_url(text + i, token); // dispose of first part of the URL
                    ts->pos = i + len;
                  } else { //still treat as url but keep the first part
                    token_i = i - start;
                    memcpy(token, text + start, token_i * sizeof(char));
                    len = token_i + std_get_url(text + i, token + token_i); // keep start
                    // len is measured from start here, not from i, so the end of
                    // the token is start + len (i + len would overshoot by token_i).
                    ts->pos = start + len;
                  }
                  token[len] = 0;
                  tk_set(ts->token, token, len, start, ts->pos, 1);
                  return ts->token;
                }

                // now see how long a url we can find.
                is_acronym = true;
                seen_at_symbol = false;
                while (isurlxatc(text[i])) {
                  if (is_acronym && !isalpha((unsigned char)text[i]) && (text[i] != '.')) {
                    is_acronym = false;
                  }
                  if (isurlxatpunc(text[i]) && isurlxatpunc(text[i-1]))
                    break; // can't have two punctuation characters in a row
                  if (text[i] == '@') {
                    if (seen_at_symbol)
                      break; // we can only have one @ symbol
                    else
                      seen_at_symbol = true;
                  }
                  i++;
                }
                while (isurlxatpunc(text[i-1])) i--; // strip trailing punctuation
                if (i > num_end) {
                  ts->pos = i;

                  if (is_acronym) { // check that it is one letter followed by one '.'
                    for (j = start; j < i-1; j++) {
                      if (isalpha((unsigned char)text[j]) && (text[j+1] != '.')) is_acronym = false;
                    }
                  }
                  if (is_acronym) {// strip '.'s
                    // token already holds the first token_i letters; append the rest
                    for (j = start + token_i; j < i; j++) {
                      if (text[j] != '.') {
                        token[token_i] = text[j];
                        token_i++;
                      }
                    }
                    tk_set(ts->token, token, token_i, start, ts->pos, 1);
                  } else { // just return the url as is
                    tk_set(ts->token, text+start, i-start, start, ts->pos, 1);
                  }
                } else { // return the number
                  ts->pos = num_end;
                  tk_set(ts->token, text+start, num_end-start, start, ts->pos, 1);
                }
              }

              return ts->token;
            }
         
     | 
| 
      
 412 
     | 
    
         
            +
             
     | 
| 
      
 413 
     | 
    
         
            +
            TokenStream *standard_tokenizer_create()
         
     | 
| 
      
 414 
     | 
    
         
            +
            {
         
     | 
| 
      
 415 
     | 
    
         
            +
              TokenStream *ts = ts_create();
         
     | 
| 
      
 416 
     | 
    
         
            +
              ts->next = &std_next;
         
     | 
| 
      
 417 
     | 
    
         
            +
              return ts;
         
     | 
| 
      
 418 
     | 
    
         
            +
            }
         
     | 
| 
      
 419 
     | 
    
         
            +
             
     | 
| 
      
 420 
     | 
    
         
            +
            /*
             * Default English stop words, grouped by first letter. This is the list
             * handed to stop_filter_create_with_words() and
             * standard_analyzer_create_with_words() by their no-argument variants.
             */
            const char *ENGLISH_STOP_WORDS[] = {
              "a", "an", "and", "are", "as", "at",
              "be", "but", "by",
              "for",
              "if", "in", "into", "is", "it",
              "no", "not",
              "of", "on", "or",
              "s", "such",
              "t", "that", "the", "their", "then", "there", "these", "they", "this", "to",
              "was", "will", "with"
            };
         
     | 
| 
      
 427 
     | 
    
         
            +
             
     | 
| 
      
 428 
     | 
    
         
            +
            /*
             * Reset callback used by the filters in this file: hands the new text
             * straight to the wrapped stream's own reset callback.
             *
             * ts   - the filter stream; only its sub_ts member is used here
             * text - the text passed through to the wrapped stream
             */
            void filter_reset(TokenStream *ts, char *text)
            {
              TokenStream *inner = ts->sub_ts;

              inner->reset(inner, text);
            }
         
     | 
| 
      
 432 
     | 
    
         
            +
             
     | 
| 
      
 433 
     | 
    
         
            +
            void filter_destroy(void *p)
         
     | 
| 
      
 434 
     | 
    
         
            +
            {
         
     | 
| 
      
 435 
     | 
    
         
            +
              TokenStream *ts = (TokenStream *)p;
         
     | 
| 
      
 436 
     | 
    
         
            +
              ts->sub_ts->destroy(ts->sub_ts);
         
     | 
| 
      
 437 
     | 
    
         
            +
              if (ts->token != NULL) tk_destroy(ts->token);
         
     | 
| 
      
 438 
     | 
    
         
            +
              free(ts);
         
     | 
| 
      
 439 
     | 
    
         
            +
            }
         
     | 
| 
      
 440 
     | 
    
         
            +
             
     | 
| 
      
 441 
     | 
    
         
            +
            void sf_destroy(void *p)
         
     | 
| 
      
 442 
     | 
    
         
            +
            {
         
     | 
| 
      
 443 
     | 
    
         
            +
              HshTable *words = (HshTable *)((TokenStream *)p)->data;
         
     | 
| 
      
 444 
     | 
    
         
            +
              h_destroy(words);
         
     | 
| 
      
 445 
     | 
    
         
            +
              filter_destroy(p);
         
     | 
| 
      
 446 
     | 
    
         
            +
            }
         
     | 
| 
      
 447 
     | 
    
         
            +
             
     | 
| 
      
 448 
     | 
    
         
            +
            /*
             * Next-token callback for the stop filter. Pulls tokens from the wrapped
             * stream and drops every token whose text is found in the stop-word table
             * stored in ts->data.
             *
             * The position increments of the dropped tokens are added to the
             * increment of the token that is finally returned, so the gap left by
             * the removed words is preserved. Increments already set by the wrapped
             * stream are accumulated rather than overwritten; when every incoming
             * token has an increment of 1 the result equals 1 + number of dropped
             * tokens.
             *
             * Returns the first non-stop-word token, or NULL when the wrapped stream
             * is exhausted.
             */
            Token *sf_next(TokenStream *ts)
            {
              int skipped_inc = 0;
              HshTable *words = (HshTable *)ts->data;
              Token *tk = ts->sub_ts->next(ts->sub_ts);

              while ((tk != NULL) && (h_get(words, tk->text) != NULL)) {
                // read the increment before fetching the next token: the wrapped
                // stream may hand back the same Token object on every call
                skipped_inc += tk->pos_inc;
                tk = ts->sub_ts->next(ts->sub_ts);
              }
              if (tk != NULL) tk->pos_inc += skipped_inc;
              return tk;
            }
         
     | 
| 
      
 460 
     | 
    
         
            +
             
     | 
| 
      
 461 
     | 
    
         
            +
            /*
             * Wraps `ts` in a stop filter that uses the given word list.
             *
             * ts    - stream to wrap; stored as the filter's sub_ts
             * words - array of `len` strings; each is inserted into a string hash
             *         table as both key and value
             * len   - number of entries in `words`
             *
             * Returns the new filter with sf_next, filter_reset and sf_destroy
             * installed as its callbacks.
             */
            TokenStream *stop_filter_create_with_words(TokenStream *ts, char **words, int len)
            {
              TokenStream *filter = ALLOC(TokenStream);
              HshTable *stop_set = h_new_str(NULL, NULL);
              int n;

              for (n = 0; n < len; n++) {
                h_set(stop_set, words[n], words[n]);
              }

              filter->sub_ts = ts;
              filter->data = stop_set;
              filter->token = NULL;
              filter->next = sf_next;
              filter->reset = filter_reset;
              filter->destroy = sf_destroy;
              return filter;
            }
         
     | 
| 
      
 477 
     | 
    
         
            +
             
     | 
| 
      
 478 
     | 
    
         
            +
            /*
             * Wraps `ts` in a stop filter that uses ENGLISH_STOP_WORDS as its word
             * list.
             */
            TokenStream *stop_filter_create(TokenStream *ts)
            {
              char **default_words = (char **)ENGLISH_STOP_WORDS;
              int word_count = NELEMS(ENGLISH_STOP_WORDS);

              return stop_filter_create_with_words(ts, default_words, word_count);
            }
         
     | 
| 
      
 483 
     | 
    
         
            +
             
     | 
| 
      
 484 
     | 
    
         
            +
            /*
             * Next-token callback for the lowercase filter. Fetches the next token
             * from the wrapped stream and lowercases its text in place, up to the
             * terminating '\0'.
             *
             * Returns the modified token, or NULL when the wrapped stream is
             * exhausted.
             */
            Token *lcf_next(TokenStream *ts)
            {
              char *c;
              Token *tk = ts->sub_ts->next(ts->sub_ts);

              if (tk == NULL) return tk;
              for (c = tk->text; *c != '\0'; c++) {
                // tolower() requires a value representable as unsigned char (or
                // EOF); a plain char holding a byte >= 0x80 may be negative,
                // which is undefined behaviour without the cast
                *c = (char)tolower((unsigned char)*c);
              }
              return tk;
            }
         
     | 
| 
      
 495 
     | 
    
         
            +
             
     | 
| 
      
 496 
     | 
    
         
            +
            /*
             * Wraps `ts` in a lowercase filter: a stream whose next callback is
             * lcf_next and whose reset/destroy callbacks are the generic
             * filter_reset/filter_destroy. The filter starts with no token of its
             * own.
             */
            TokenStream *lowercase_filter_create(TokenStream *ts)
            {
              TokenStream *filter = ALLOC(TokenStream);

              filter->sub_ts = ts;
              filter->token = NULL;
              filter->next = lcf_next;
              filter->reset = filter_reset;
              filter->destroy = filter_destroy;
              return filter;
            }
         
     | 
| 
      
 506 
     | 
    
         
            +
             
     | 
| 
      
 507 
     | 
    
         
            +
            Analyzer *letter_analyzer_create()
         
     | 
| 
      
 508 
     | 
    
         
            +
            {
         
     | 
| 
      
 509 
     | 
    
         
            +
              Analyzer *a = ALLOC(Analyzer);
         
     | 
| 
      
 510 
     | 
    
         
            +
              a->data = NULL;
         
     | 
| 
      
 511 
     | 
    
         
            +
              a->current_ts = lowercase_filter_create(letter_tokenizer_create());
         
     | 
| 
      
 512 
     | 
    
         
            +
              a->destroy = &a_standard_destroy;
         
     | 
| 
      
 513 
     | 
    
         
            +
              a->get_ts = &a_standard_get_ts;
         
     | 
| 
      
 514 
     | 
    
         
            +
              return a;
         
     | 
| 
      
 515 
     | 
    
         
            +
            }
         
     | 
| 
      
 516 
     | 
    
         
            +
             
     | 
| 
      
 517 
     | 
    
         
            +
             
     | 
| 
      
 518 
     | 
    
         
            +
            Analyzer *standard_analyzer_create_with_words(char **words, int len)
         
     | 
| 
      
 519 
     | 
    
         
            +
            {
         
     | 
| 
      
 520 
     | 
    
         
            +
              Analyzer *a = ALLOC(Analyzer);
         
     | 
| 
      
 521 
     | 
    
         
            +
              a->data = NULL;
         
     | 
| 
      
 522 
     | 
    
         
            +
              a->current_ts = 
         
     | 
| 
      
 523 
     | 
    
         
            +
                stop_filter_create_with_words(
         
     | 
| 
      
 524 
     | 
    
         
            +
                    lowercase_filter_create(standard_tokenizer_create()), words, len);
         
     | 
| 
      
 525 
     | 
    
         
            +
              a->destroy = &a_standard_destroy;
         
     | 
| 
      
 526 
     | 
    
         
            +
              a->get_ts = &a_standard_get_ts;
         
     | 
| 
      
 527 
     | 
    
         
            +
              return a;
         
     | 
| 
      
 528 
     | 
    
         
            +
            }
         
     | 
| 
      
 529 
     | 
    
         
            +
             
     | 
| 
      
 530 
     | 
    
         
            +
            Analyzer *standard_analyzer_create()
         
     | 
| 
      
 531 
     | 
    
         
            +
            {
         
     | 
| 
      
 532 
     | 
    
         
            +
              return standard_analyzer_create_with_words(
         
     | 
| 
      
 533 
     | 
    
         
            +
                    (char **)ENGLISH_STOP_WORDS, NELEMS(ENGLISH_STOP_WORDS));
         
     | 
| 
      
 534 
     | 
    
         
            +
            }
         
     | 
| 
      
 535 
     | 
    
         
            +
             
     | 
| 
      
 536 
     | 
    
         
            +
            #ifdef ALONE
            /*
             * Stand-alone driver, compiled only when ALONE is defined. Reads stdin
             * line by line, runs each line through a standard analyzer and prints
             * every token as "<text:start:end> ", with one output line per input
             * line.
             */
            int main(int argc, char **argv)
            {
              char buf[10000];
              Analyzer *a = standard_analyzer_create();
              TokenStream *ts;
              Token *tk;

              // command-line arguments are not used
              (void)argc;
              (void)argv;

              // size the read from the buffer itself instead of a separate constant
              while (fgets(buf, sizeof(buf), stdin) != NULL) {
                ts = a->get_ts(a, "hello", buf);
                ts->pos = 0;
                while ((tk = ts->next(ts)) != NULL) {
                  printf("<%s:%ld:%ld> ", tk->text, tk->start, tk->end);
                }
                printf("\n");
              }

              // release the analyzer through its own destroy callback
              a->destroy(a);
              return 0;
            }
            #endif
         
     |