uhferret 1.3.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/COPYING.txt +674 -0
- data/README.rdoc +79 -0
- data/bin/uhferret +129 -0
- data/bin/uhferret-server +68 -0
- data/ext/document.cpp +231 -0
- data/ext/document.h +89 -0
- data/ext/documentlist.cpp +229 -0
- data/ext/documentlist.h +80 -0
- data/ext/extconf.rb +2 -0
- data/ext/tokenreader.cpp +196 -0
- data/ext/tokenreader.h +85 -0
- data/ext/tokenset.cpp +111 -0
- data/ext/tokenset.h +73 -0
- data/ext/tupleset.cpp +150 -0
- data/ext/tupleset.h +92 -0
- data/ext/uhferret_lib_wrap.cxx +10726 -0
- data/lib/uhferret.rb +441 -0
- data/lib/utils.rb +93 -0
- data/lib/webferret.rb +246 -0
- metadata +71 -0
data/ext/tokenreader.h
ADDED
@@ -0,0 +1,85 @@
|
|
1
|
+
#if !defined tokenreader_h
|
2
|
+
#define tokenreader_h
|
3
|
+
|
4
|
+
/**
|
5
|
+
* This file is part of uhferret.
|
6
|
+
*
|
7
|
+
* Author:: Peter Lane
|
8
|
+
* Copyright:: Copyright 2011, Peter Lane.
|
9
|
+
* License:: GPLv3
|
10
|
+
*
|
11
|
+
* uhferret is free software: you can redistribute it and/or modify
|
12
|
+
* it under the terms of the GNU General Public License as published by
|
13
|
+
* the Free Software Foundation, either version 3 of the License, or
|
14
|
+
* (at your option) any later version.
|
15
|
+
*
|
16
|
+
* uhferret is distributed in the hope that it will be useful,
|
17
|
+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
18
|
+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
19
|
+
* GNU General Public License for more details.
|
20
|
+
*
|
21
|
+
* You should have received a copy of the GNU General Public License
|
22
|
+
* along with uhferret. If not, see <http://www.gnu.org/licenses/>.
|
23
|
+
*/
|
24
|
+
|
25
|
+
#include <ctype.h> // gives tests for if characters are numbers, alphanumerics, etc
|
26
|
+
#include <istream>
|
27
|
+
|
28
|
+
#include "tokenset.h"
|
29
|
+
|
30
|
+
/** The TokenReader is the parent class of the different 'token-isers'
|
31
|
+
* -- WordReader tokenises a document into strings of alphanumeric characters
|
32
|
+
* -- CCodeReader tokenises a document into symbols matching a C-style language
|
33
|
+
* The token reader is initialised with an input stream
|
34
|
+
* -- GetToken is used to 'walk through' the document, one token at a time
|
35
|
+
* until IsFinished returns true.
|
36
|
+
* -- the start and end points of the token can be retrieved using the given methods,
|
37
|
+
* and the string making up the token can be obtained by caller from the TokenSet
|
38
|
+
*/
|
39
|
+
class TokenReader
|
40
|
+
{
|
41
|
+
public:
|
42
|
+
TokenReader (std::istream & input);
|
43
|
+
// return index of last read token
|
44
|
+
std::size_t GetToken (TokenSet & tokenset); // retrieve current token identifier
|
45
|
+
bool IsFinished () const; // return true if end-of-file reached
|
46
|
+
int GetTokenStart () const; // return the start position of current token
|
47
|
+
int GetTokenEnd () const; // return the end position of current token
|
48
|
+
// read token, return true if successful
|
49
|
+
// -- user of class must provide this method
|
50
|
+
virtual bool ReadToken () = 0;
|
51
|
+
protected: // allow subclasses to access parameters
|
52
|
+
std::istream & _input; // the stream from which to read
|
53
|
+
int _position; // current position in stream
|
54
|
+
Token _token; // last token read
|
55
|
+
int _token_start; // start position of last token read
|
56
|
+
char _look; // lookahead character
|
57
|
+
bool _done; // becomes true when stream is completed
|
58
|
+
};
|
59
|
+
|
60
|
+
// The WordReader separates its input stream into tokens, consisting of
|
61
|
+
// consecutive alphabetic characters
|
62
|
+
// -- every character is converted to lower case
|
63
|
+
class WordReader: public TokenReader
|
64
|
+
{
|
65
|
+
public:
|
66
|
+
WordReader (std::istream & input) : TokenReader (input) {}
|
67
|
+
bool IsAlphabetChar (char ch);
|
68
|
+
bool IsSingleCharWord (char ch);
|
69
|
+
bool ReadToken ();
|
70
|
+
};
|
71
|
+
|
72
|
+
// The CCodeReader separates its input stream into tokens, looking for
|
73
|
+
// C-style tokens, numbers and symbols
|
74
|
+
class CCodeReader: public TokenReader
|
75
|
+
{
|
76
|
+
public:
|
77
|
+
CCodeReader (std::istream & input) : TokenReader (input) {}
|
78
|
+
bool ReadToken ();
|
79
|
+
private:
|
80
|
+
bool IsSymbol (char c);
|
81
|
+
bool IsSymbol (std::string token, char c);
|
82
|
+
};
|
83
|
+
|
84
|
+
#endif
|
85
|
+
|
data/ext/tokenset.cpp
ADDED
@@ -0,0 +1,111 @@
|
|
1
|
+
#include "tokenset.h"
|
2
|
+
|
3
|
+
/**
|
4
|
+
* This file is part of uhferret.
|
5
|
+
*
|
6
|
+
* Author:: Peter Lane
|
7
|
+
* Copyright:: Copyright 2011, Peter Lane.
|
8
|
+
* License:: GPLv3
|
9
|
+
*
|
10
|
+
* uhferret is free software: you can redistribute it and/or modify
|
11
|
+
* it under the terms of the GNU General Public License as published by
|
12
|
+
* the Free Software Foundation, either version 3 of the License, or
|
13
|
+
* (at your option) any later version.
|
14
|
+
*
|
15
|
+
* uhferret is distributed in the hope that it will be useful,
|
16
|
+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
17
|
+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
18
|
+
* GNU General Public License for more details.
|
19
|
+
*
|
20
|
+
* You should have received a copy of the GNU General Public License
|
21
|
+
* along with uhferret. If not, see <http://www.gnu.org/licenses/>.
|
22
|
+
*/
|
23
|
+
|
24
|
+
// Create an empty token with a small initial buffer (two characters);
// the buffer is enlarged on demand by Grow.
Token::Token ()
  : _capacity (2),
    _top (0)
{
  _token = new char [_capacity];
}
|
29
|
+
|
30
|
+
// Release the character buffer owned by this token.
Token::~Token ()
{
  delete[] _token;
}
|
34
|
+
|
35
|
+
void Token::Erase ()
|
36
|
+
{
|
37
|
+
_top = 0;
|
38
|
+
}
|
39
|
+
|
40
|
+
void Token::AddChar (char c)
|
41
|
+
{
|
42
|
+
if (_top == _capacity)
|
43
|
+
Grow ();
|
44
|
+
_token [_top] = c;
|
45
|
+
++_top;
|
46
|
+
}
|
47
|
+
|
48
|
+
std::string Token::GetString () const
|
49
|
+
{
|
50
|
+
return std::string (_token, _top);
|
51
|
+
}
|
52
|
+
|
53
|
+
int Token::GetLength () const
|
54
|
+
{
|
55
|
+
return _top;
|
56
|
+
}
|
57
|
+
|
58
|
+
void Token::Grow ()
|
59
|
+
{
|
60
|
+
char * newtoken = new char [2 * _capacity];
|
61
|
+
for (int i = 0; i < _capacity; ++i)
|
62
|
+
newtoken[i] = _token[i];
|
63
|
+
_capacity = 2 * _capacity;
|
64
|
+
delete _token;
|
65
|
+
_token = newtoken;
|
66
|
+
}
|
67
|
+
|
68
|
+
// *** TokenSet
|
69
|
+
|
70
|
+
// Create an empty token set; the first string registered receives index 0.
TokenSet::TokenSet ()
  : _nextindex (0)
{
}
|
73
|
+
|
74
|
+
std::size_t TokenSet::GetIndexFor (std::string token)
|
75
|
+
{
|
76
|
+
_tokens_it = _tokens.find (token);
|
77
|
+
if (_tokens_it != _tokens.end()) // found it
|
78
|
+
return _tokens_it->second;
|
79
|
+
else // otherwise, make a new index
|
80
|
+
{
|
81
|
+
_tokens[token] = _nextindex;
|
82
|
+
_strings[_nextindex] = token;
|
83
|
+
_nextindex++;
|
84
|
+
return _nextindex-1;
|
85
|
+
}
|
86
|
+
}
|
87
|
+
|
88
|
+
std::string TokenSet::GetStringFor (std::size_t token)
|
89
|
+
{
|
90
|
+
_strings_it = _strings.find (token);
|
91
|
+
assert (_strings_it != _strings.end ()); // it's an error if token not in token set
|
92
|
+
return std::string (_strings_it->second.c_str ());
|
93
|
+
}
|
94
|
+
|
95
|
+
void TokenSet::Clear ()
|
96
|
+
{
|
97
|
+
_tokens.clear ();
|
98
|
+
_strings.clear ();
|
99
|
+
_nextindex = 0;
|
100
|
+
}
|
101
|
+
|
102
|
+
void TokenSet::SetNextIndex (int index)
|
103
|
+
{
|
104
|
+
_nextindex = index;
|
105
|
+
}
|
106
|
+
|
107
|
+
void TokenSet::SetIndexString (std::string token, int index)
|
108
|
+
{
|
109
|
+
_strings[index] = token;
|
110
|
+
_tokens[token] = index;
|
111
|
+
}
|
data/ext/tokenset.h
ADDED
@@ -0,0 +1,73 @@
|
|
1
|
+
#if !defined tokenset_h
|
2
|
+
#define tokenset_h
|
3
|
+
|
4
|
+
/**
|
5
|
+
* This file is part of uhferret.
|
6
|
+
*
|
7
|
+
* Author:: Peter Lane
|
8
|
+
* Copyright:: Copyright 2011, Peter Lane.
|
9
|
+
* License:: GPLv3
|
10
|
+
*
|
11
|
+
* uhferret is free software: you can redistribute it and/or modify
|
12
|
+
* it under the terms of the GNU General Public License as published by
|
13
|
+
* the Free Software Foundation, either version 3 of the License, or
|
14
|
+
* (at your option) any later version.
|
15
|
+
*
|
16
|
+
* uhferret is distributed in the hope that it will be useful,
|
17
|
+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
18
|
+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
19
|
+
* GNU General Public License for more details.
|
20
|
+
*
|
21
|
+
* You should have received a copy of the GNU General Public License
|
22
|
+
* along with uhferret. If not, see <http://www.gnu.org/licenses/>.
|
23
|
+
*/
|
24
|
+
|
25
|
+
#include <assert.h>
|
26
|
+
#include <map>
|
27
|
+
#include <string>
|
28
|
+
#include <vector>
|
29
|
+
|
30
|
+
/** A Token is a sequence of characters read in by a TokenReader
|
31
|
+
* -- this class provides a dynamic storage for the token supporting
|
32
|
+
* addition of characters
|
33
|
+
* -- when finished, the token can be queried for its length and made into a string
|
34
|
+
*/
|
35
|
+
class Token
{
  public:
    Token ();                        // start with an empty token
    ~Token ();                       // releases the character buffer
    void Erase ();                   // discard the stored characters
    void AddChar (char c);           // append a character, growing if needed
    std::string GetString () const;  // copy of the stored characters
    int GetLength () const;          // number of stored characters
  private:
    // A Token owns a raw character buffer which its destructor frees, so a
    // member-wise copy would lead to the same buffer being freed twice.
    // Copying is therefore disabled: declared private and never defined.
    Token (const Token &);
    Token & operator= (const Token &);

    void Grow ();     // enlarge the buffer
    char * _token;    // storage for the token
    int _capacity;    // number of characters the storage can hold
    int _top;         // index one past the last stored character
};
|
50
|
+
|
51
|
+
/** A TokenSet maps strings to token indices
|
52
|
+
* -- this is for memory efficiency, ensuring every token's string is
|
53
|
+
* stored once within the application
|
54
|
+
*/
|
55
|
+
class TokenSet
{
    // shorthand for the two lookup tables
    typedef std::map<std::string, std::size_t> IndexMap;
    typedef std::map<std::size_t, std::string> StringMap;

  public:
    TokenSet ();

    // string -> index, and index -> string
    std::size_t GetIndexFor (std::string token);
    std::string GetStringFor (std::size_t token);

    // empty the set
    void Clear ();

    // direct control over the stored index/string associations
    void SetNextIndex (int index);
    void SetIndexString (std::string token, int index);

  private:
    IndexMap _tokens;                      // string -> index
    IndexMap::const_iterator _tokens_it;   // iterator used for lookups in _tokens
    std::size_t _nextindex;                // next free index for new string
    StringMap _strings;                    // index -> string
    StringMap::const_iterator _strings_it; // iterator used for lookups in _strings
};
|
71
|
+
|
72
|
+
#endif
|
73
|
+
|
data/ext/tupleset.cpp
ADDED
@@ -0,0 +1,150 @@
|
|
1
|
+
#include "tupleset.h"
|
2
|
+
|
3
|
+
/**
|
4
|
+
* This file is part of uhferret.
|
5
|
+
*
|
6
|
+
* Author:: Peter Lane
|
7
|
+
* Copyright:: Copyright 2011, Peter Lane.
|
8
|
+
* License:: GPLv3
|
9
|
+
*
|
10
|
+
* uhferret is free software: you can redistribute it and/or modify
|
11
|
+
* it under the terms of the GNU General Public License as published by
|
12
|
+
* the Free Software Foundation, either version 3 of the License, or
|
13
|
+
* (at your option) any later version.
|
14
|
+
*
|
15
|
+
* uhferret is distributed in the hope that it will be useful,
|
16
|
+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
17
|
+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
18
|
+
* GNU General Public License for more details.
|
19
|
+
*
|
20
|
+
* You should have received a copy of the GNU General Public License
|
21
|
+
* along with uhferret. If not, see <http://www.gnu.org/licenses/>.
|
22
|
+
*/
|
23
|
+
|
24
|
+
// Create an empty tuple set; all members are default-constructed.
TupleSet::TupleSet ()
{
}
|
26
|
+
|
27
|
+
void TupleSet::Clear ()
|
28
|
+
{
|
29
|
+
_tuple_map.clear ();
|
30
|
+
}
|
31
|
+
|
32
|
+
int TupleSet::Size ()
|
33
|
+
{
|
34
|
+
int trigram_count = 0;
|
35
|
+
for (Begin (); HasMore (); GetNext ())
|
36
|
+
{
|
37
|
+
trigram_count++;
|
38
|
+
}
|
39
|
+
|
40
|
+
return trigram_count;
|
41
|
+
}
|
42
|
+
|
43
|
+
std::vector<int> & TupleSet::GetDocumentsForTuple (std::size_t token_0, std::size_t token_1, std::size_t token_2)
|
44
|
+
{
|
45
|
+
return _tuple_map[token_0][token_1][token_2];
|
46
|
+
}
|
47
|
+
|
48
|
+
bool TupleSet::AddDocument (std::size_t token_0, std::size_t token_1, std::size_t token_2, int document)
|
49
|
+
{
|
50
|
+
bool has_doc = false;
|
51
|
+
std::vector<int> & fvector = _tuple_map[token_0][token_1][token_2];
|
52
|
+
// check if document is already in the trigram
|
53
|
+
for (int i = 0, n = fvector.size(); i < n; ++i)
|
54
|
+
{
|
55
|
+
if (fvector[i] == document)
|
56
|
+
{
|
57
|
+
has_doc = true;
|
58
|
+
break;
|
59
|
+
}
|
60
|
+
}
|
61
|
+
|
62
|
+
if (!has_doc) // didn't have document, so add it
|
63
|
+
{
|
64
|
+
fvector.push_back (document);
|
65
|
+
return true; // indicate that document added
|
66
|
+
}
|
67
|
+
return false;
|
68
|
+
}
|
69
|
+
|
70
|
+
bool TupleSet::IsMatchingTuple (std::size_t t0, std::size_t t1, std::size_t t2, int doc1, int doc2)
|
71
|
+
{
|
72
|
+
std::vector<int> fvector = GetDocumentsForTuple (t0, t1, t2);
|
73
|
+
bool has_doc1 = false;
|
74
|
+
bool has_doc2 = false;
|
75
|
+
for (int i=0, n=fvector.size(); i<n; ++i)
|
76
|
+
{
|
77
|
+
if (fvector[i] == doc1) has_doc1 = true;
|
78
|
+
if (fvector[i] == doc2) has_doc2 = true;
|
79
|
+
}
|
80
|
+
return ( has_doc1 && has_doc2 );
|
81
|
+
}
|
82
|
+
|
83
|
+
std::vector<std::string> TupleSet::CollectMatchingTuples (int doc1, int doc2, TokenSet & tokenset)
|
84
|
+
{
|
85
|
+
std::vector<std::string> tuples;
|
86
|
+
for (Begin (); HasMore (); GetNext ())
|
87
|
+
{
|
88
|
+
if (IsMatchingTuple (GetToken (0), GetToken (1), GetToken (2),
|
89
|
+
doc1, doc2))
|
90
|
+
{
|
91
|
+
tuples.push_back (GetStringForCurrentTuple (tokenset));
|
92
|
+
}
|
93
|
+
}
|
94
|
+
return tuples;
|
95
|
+
}
|
96
|
+
|
97
|
+
void TupleSet::Begin ()
|
98
|
+
{
|
99
|
+
_ti = _tuple_map.begin ();
|
100
|
+
_pi = (_ti->second).begin ();
|
101
|
+
_wi = (_pi->second).begin ();
|
102
|
+
}
|
103
|
+
|
104
|
+
void TupleSet::GetNext ()
|
105
|
+
{
|
106
|
+
_wi++; // move to next word position
|
107
|
+
if (_wi == (_pi->second).end ()) // if words have finished, then move to next pair position
|
108
|
+
{
|
109
|
+
_pi++;
|
110
|
+
if (_pi == (_ti->second).end ()) // if pairs have finished, then move to next triple position
|
111
|
+
{
|
112
|
+
_ti++;
|
113
|
+
if (_ti == _tuple_map.end ()) return; // finished
|
114
|
+
_pi = (_ti->second).begin (); // get next pair iterator
|
115
|
+
}
|
116
|
+
_wi = (_pi->second).begin (); // get next word iterator
|
117
|
+
}
|
118
|
+
}
|
119
|
+
|
120
|
+
bool TupleSet::HasMore () const
|
121
|
+
{
|
122
|
+
return _ti != _tuple_map.end ();
|
123
|
+
}
|
124
|
+
|
125
|
+
std::vector<int> & TupleSet::GetDocumentsForCurrentTuple ()
|
126
|
+
{
|
127
|
+
return GetDocumentsForTuple (_ti->first, _pi->first, _wi->first);
|
128
|
+
}
|
129
|
+
|
130
|
+
// Build the text of the current tuple: the strings of its three tokens,
// looked up in the given TokenSet and separated by single spaces.
std::string TupleSet::GetStringForCurrentTuple (TokenSet & tokenset) const
{
  std::string tuple = tokenset.GetStringFor (_ti->first);
  tuple += " ";
  tuple += tokenset.GetStringFor (_pi->first);
  tuple += " ";
  tuple += tokenset.GetStringFor (_wi->first);
  return tuple;
}
|
140
|
+
|
141
|
+
std::size_t TupleSet::GetToken (int i) const
|
142
|
+
{
|
143
|
+
assert (i>=0 && i<=2);
|
144
|
+
if (i == 0)
|
145
|
+
return _ti->first;
|
146
|
+
else if (i == 1)
|
147
|
+
return _pi->first;
|
148
|
+
else // if (i == 2)
|
149
|
+
return _wi->first;
|
150
|
+
}
|
data/ext/tupleset.h
ADDED
@@ -0,0 +1,92 @@
|
|
1
|
+
#if !defined tupleset_h
|
2
|
+
#define tupleset_h
|
3
|
+
|
4
|
+
/**
|
5
|
+
* This file is part of uhferret.
|
6
|
+
* Initial triple map idea by Bob Dickerson.
|
7
|
+
*
|
8
|
+
* Author:: Peter Lane
|
9
|
+
* Copyright:: Copyright 2011, Peter Lane.
|
10
|
+
* License:: GPLv3
|
11
|
+
*
|
12
|
+
* uhferret is free software: you can redistribute it and/or modify
|
13
|
+
* it under the terms of the GNU General Public License as published by
|
14
|
+
* the Free Software Foundation, either version 3 of the License, or
|
15
|
+
* (at your option) any later version.
|
16
|
+
*
|
17
|
+
* uhferret is distributed in the hope that it will be useful,
|
18
|
+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
19
|
+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
20
|
+
* GNU General Public License for more details.
|
21
|
+
*
|
22
|
+
* You should have received a copy of the GNU General Public License
|
23
|
+
* along with uhferret. If not, see <http://www.gnu.org/licenses/>.
|
24
|
+
*/
|
25
|
+
|
26
|
+
#include <assert.h>
|
27
|
+
#include <map>
|
28
|
+
#include <string>
|
29
|
+
#include <vector>
|
30
|
+
|
31
|
+
#include "tokenset.h"
|
32
|
+
|
33
|
+
/** TupleSet maintains the database mapping trigrams to identifier of documents which contain them.
|
34
|
+
* The mapping is held as a sequence of std::maps, each map taking a std::size_t reference to
|
35
|
+
* a token as a key. The end result of the three maps is a vector of document identifiers.
|
36
|
+
*
|
37
|
+
* The most important feature of the TupleSet is the collection of methods for iterating over
|
38
|
+
* all tuples in the TupleSet.
|
39
|
+
* e.g. with the definition: TupleSet tuple_set;
|
40
|
+
* use: for (tuple_set.Begin (); tuple_set.HasMore (); tuple_set.GetNext ())
|
41
|
+
* {}
|
42
|
+
* to iterate over all the tuples. The methods: GetDocumentsForCurrentTuple, GetStringForCurrentTuple,
|
43
|
+
* and GetToken (0), GetToken (1), GetToken (2) return information on the current tuple.
|
44
|
+
*/
|
45
|
+
class TupleSet
|
46
|
+
{
|
47
|
+
// typedef's to simplify declarations
|
48
|
+
typedef std::map<std::size_t, std::vector<int> > WordMap;
|
49
|
+
typedef WordMap::const_iterator WordMapIter;
|
50
|
+
|
51
|
+
typedef std::map<std::size_t, WordMap> PairMap;
|
52
|
+
typedef PairMap::const_iterator PairMapIter;
|
53
|
+
|
54
|
+
typedef std::map<std::size_t, PairMap> TripMap;
|
55
|
+
typedef TripMap::const_iterator TripMapIter;
|
56
|
+
|
57
|
+
public:
|
58
|
+
TupleSet ();
|
59
|
+
void Clear ();
|
60
|
+
int Size ();
|
61
|
+
// given a tuple, return the list of documents which contain that tuple
|
62
|
+
std::vector<int> & GetDocumentsForTuple (std::size_t token_0,
|
63
|
+
std::size_t token_1, std::size_t token_2);
|
64
|
+
// given a tuple and a document identifier,
|
65
|
+
// - make sure that the document is in the list for that tuple
|
66
|
+
// - returns true if the document was not already in trigram's list
|
67
|
+
bool AddDocument (std::size_t token_0, std::size_t token_1, std::size_t token_2,
|
68
|
+
int document);
|
69
|
+
// check if two documents share the given tuple
|
70
|
+
bool IsMatchingTuple (std::size_t t0, std::size_t t1, std::size_t t2, int doc1, int doc2);
|
71
|
+
// collect and return all tuples in the two given documents
|
72
|
+
std::vector<std::string> CollectMatchingTuples (int doc1, int doc2, TokenSet & tokenset);
|
73
|
+
private:
|
74
|
+
TripMap _tuple_map;
|
75
|
+
public: // following methods and data structures are to handle an iterator on tupleset
|
76
|
+
void Begin (); // start the iterator
|
77
|
+
void GetNext (); // advance the iterator
|
78
|
+
bool HasMore () const; // check for end
|
79
|
+
// retrieve current tuple's documents
|
80
|
+
std::vector<int> & GetDocumentsForCurrentTuple ();
|
81
|
+
// retrieve string for current tuple
|
82
|
+
std::string GetStringForCurrentTuple (TokenSet & tokenset) const;
|
83
|
+
// retrieve identifiers for individual tokens
|
84
|
+
std::size_t GetToken (int i) const;
|
85
|
+
private:
|
86
|
+
TripMapIter _ti; // iterator from first token to pairs
|
87
|
+
PairMapIter _pi; // iterator from second token to words
|
88
|
+
WordMapIter _wi; // iterator from third token to document list
|
89
|
+
};
|
90
|
+
|
91
|
+
#endif
|
92
|
+
|