uhferret 1.3.7

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,85 @@
1
+ #if !defined tokenreader_h
2
+ #define tokenreader_h
3
+
4
+ /**
5
+ * This file is part of uhferret.
6
+ *
7
+ * Author:: Peter Lane
8
+ * Copyright:: Copyright 2011, Peter Lane.
9
+ * License:: GPLv3
10
+ *
11
+ * uhferret is free software: you can redistribute it and/or modify
12
+ * it under the terms of the GNU General Public License as published by
13
+ * the Free Software Foundation, either version 3 of the License, or
14
+ * (at your option) any later version.
15
+ *
16
+ * uhferret is distributed in the hope that it will be useful,
17
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
18
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19
+ * GNU General Public License for more details.
20
+ *
21
+ * You should have received a copy of the GNU General Public License
22
+ * along with uhferret. If not, see <http://www.gnu.org/licenses/>.
23
+ */
24
+
25
+ #include <ctype.h> // gives tests for if characters are numbers, alphanumerics, etc
26
+ #include <istream>
27
+
28
+ #include "tokenset.h"
29
+
30
+ /** The TokenReader is the parent class of the different 'token-isers'
31
+ * -- WordReader tokenises a document into strings of alphanumeric characters
32
+ * -- CCodeReader tokenises a document into symbols matching a C-style language
33
+ * The token reader is initialised with an input stream
34
+ * -- GetToken is used to 'walk through' the document, one token at a time
35
+ * until IsFinished returns true.
36
+ * -- the start and end points of the token can be retrieved using the given methods,
37
+ * and the string making up the token can be obtained by caller from the TokenSet
38
+ */
39
+ class TokenReader
40
+ {
41
+ public:
42
+ TokenReader (std::istream & input);
43
+ // return index of last read token
44
+ std::size_t GetToken (TokenSet & tokenset); // retrieve current token identifier
45
+ bool IsFinished () const; // return true if end-of-file reached
46
+ int GetTokenStart () const; // return the start position of current token
47
+ int GetTokenEnd () const; // return the end position of current token
48
+ // read token, return true if successful
49
+ // -- user of class must provide this method
50
+ virtual bool ReadToken () = 0;
51
+ protected: // allow subclasses to access parameters
52
+ std::istream & _input; // the stream from which to read
53
+ int _position; // current position in stream
54
+ Token _token; // last token read
55
+ int _token_start; // start position of last token read
56
+ char _look; // lookahead character
57
+ bool _done; // becomes true when stream is completed
58
+ };
59
+
60
+ // The WordReader separates its input stream into tokens, consisting of
61
+ // consecutive alphabetic characters
62
+ // -- every character is converted to lower case
63
+ class WordReader: public TokenReader
64
+ {
65
+ public:
66
+ WordReader (std::istream & input) : TokenReader (input) {}
67
+ bool IsAlphabetChar (char ch);
68
+ bool IsSingleCharWord (char ch);
69
+ bool ReadToken ();
70
+ };
71
+
72
+ // The CCodeReader separates its input stream into tokens, looking for
73
+ // C-style tokens, numbers and symbols
74
+ class CCodeReader: public TokenReader
75
+ {
76
+ public:
77
+ CCodeReader (std::istream & input) : TokenReader (input) {}
78
+ bool ReadToken ();
79
+ private:
80
+ bool IsSymbol (char c);
81
+ bool IsSymbol (std::string token, char c);
82
+ };
83
+
84
+ #endif
85
+
@@ -0,0 +1,111 @@
1
+ #include "tokenset.h"
2
+
3
+ /**
4
+ * This file is part of uhferret.
5
+ *
6
+ * Author:: Peter Lane
7
+ * Copyright:: Copyright 2011, Peter Lane.
8
+ * License:: GPLv3
9
+ *
10
+ * uhferret is free software: you can redistribute it and/or modify
11
+ * it under the terms of the GNU General Public License as published by
12
+ * the Free Software Foundation, either version 3 of the License, or
13
+ * (at your option) any later version.
14
+ *
15
+ * uhferret is distributed in the hope that it will be useful,
16
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
17
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18
+ * GNU General Public License for more details.
19
+ *
20
+ * You should have received a copy of the GNU General Public License
21
+ * along with uhferret. If not, see <http://www.gnu.org/licenses/>.
22
+ */
23
+
24
+ Token::Token ()
25
+ : _capacity (2), _top (0)
26
+ {
27
+ _token = new char [_capacity];
28
+ }
29
+
30
+ Token::~Token ()
31
+ {
32
+ delete[] _token;
33
+ }
34
+
35
+ void Token::Erase ()
36
+ {
37
+ _top = 0;
38
+ }
39
+
40
+ void Token::AddChar (char c)
41
+ {
42
+ if (_top == _capacity)
43
+ Grow ();
44
+ _token [_top] = c;
45
+ ++_top;
46
+ }
47
+
48
+ std::string Token::GetString () const
49
+ {
50
+ return std::string (_token, _top);
51
+ }
52
+
53
+ int Token::GetLength () const
54
+ {
55
+ return _top;
56
+ }
57
+
58
+ void Token::Grow ()
59
+ {
60
+ char * newtoken = new char [2 * _capacity];
61
+ for (int i = 0; i < _capacity; ++i)
62
+ newtoken[i] = _token[i];
63
+ _capacity = 2 * _capacity;
64
+ delete _token;
65
+ _token = newtoken;
66
+ }
67
+
68
+ // *** TokenSet
69
+
70
+ TokenSet::TokenSet ()
71
+ : _nextindex (0)
72
+ {}
73
+
74
+ std::size_t TokenSet::GetIndexFor (std::string token)
75
+ {
76
+ _tokens_it = _tokens.find (token);
77
+ if (_tokens_it != _tokens.end()) // found it
78
+ return _tokens_it->second;
79
+ else // otherwise, make a new index
80
+ {
81
+ _tokens[token] = _nextindex;
82
+ _strings[_nextindex] = token;
83
+ _nextindex++;
84
+ return _nextindex-1;
85
+ }
86
+ }
87
+
88
+ std::string TokenSet::GetStringFor (std::size_t token)
89
+ {
90
+ _strings_it = _strings.find (token);
91
+ assert (_strings_it != _strings.end ()); // it's an error if token not in token set
92
+ return std::string (_strings_it->second.c_str ());
93
+ }
94
+
95
+ void TokenSet::Clear ()
96
+ {
97
+ _tokens.clear ();
98
+ _strings.clear ();
99
+ _nextindex = 0;
100
+ }
101
+
102
+ void TokenSet::SetNextIndex (int index)
103
+ {
104
+ _nextindex = index;
105
+ }
106
+
107
+ void TokenSet::SetIndexString (std::string token, int index)
108
+ {
109
+ _strings[index] = token;
110
+ _tokens[token] = index;
111
+ }
@@ -0,0 +1,73 @@
1
+ #if !defined tokenset_h
2
+ #define tokenset_h
3
+
4
+ /**
5
+ * This file is part of uhferret.
6
+ *
7
+ * Author:: Peter Lane
8
+ * Copyright:: Copyright 2011, Peter Lane.
9
+ * License:: GPLv3
10
+ *
11
+ * uhferret is free software: you can redistribute it and/or modify
12
+ * it under the terms of the GNU General Public License as published by
13
+ * the Free Software Foundation, either version 3 of the License, or
14
+ * (at your option) any later version.
15
+ *
16
+ * uhferret is distributed in the hope that it will be useful,
17
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
18
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19
+ * GNU General Public License for more details.
20
+ *
21
+ * You should have received a copy of the GNU General Public License
22
+ * along with uhferret. If not, see <http://www.gnu.org/licenses/>.
23
+ */
24
+
25
+ #include <assert.h>
26
+ #include <map>
27
+ #include <string>
28
+ #include <vector>
29
+
30
+ /** A Token is a sequence of characters read in by a TokenReader
31
+ * -- this class provides a dynamic storage for the token supporting
32
+ * addition of characters
33
+ * -- when finished, the token can be queried for its length and made into a string
34
+ */
35
class Token
{
  public:
    Token ();
    ~Token ();
    // Deep copy. The class owns a raw buffer, so the compiler-generated copy
    // would leave two Tokens sharing one buffer and freeing it twice.
    Token (const Token & other)
      : _token (new char [other._capacity]),
        _capacity (other._capacity),
        _top (other._top)
    {
      for (int i = 0; i < _top; ++i)
        _token[i] = other._token[i];
    }
    // Deep-copy assignment; the replacement buffer is built before the old
    // one is released, so a failed allocation leaves this Token unchanged.
    Token & operator= (const Token & other)
    {
      if (this != &other)
      {
        char * replacement = new char [other._capacity];
        for (int i = 0; i < other._top; ++i)
          replacement[i] = other._token[i];
        delete[] _token;
        _token = replacement;
        _capacity = other._capacity;
        _top = other._top;
      }
      return *this;
    }
    void Erase ();                   // empty the token
    void AddChar (char c);           // append a character
    std::string GetString () const;  // characters added so far, as a string
    int GetLength () const;          // number of characters added so far
  private:
    void Grow ();     // enlarge the storage
    char * _token;    // storage for the token
    int _capacity;    // number of characters the storage can hold
    int _top;         // index one past the last stored character
};
50
+
51
+ /** A TokenSet maps strings to token indices
52
+ * -- this is for memory efficiency, ensuring every token's string is
53
+ * stored once within the application
54
+ */
55
class TokenSet
{
  public:
    TokenSet ();

    // lookups in each direction
    std::size_t GetIndexFor (std::string token);
    std::string GetStringFor (std::size_t token);

    // reset the set
    void Clear ();

    // direct manipulation of the index/string tables
    void SetNextIndex (int index);
    void SetIndexString (std::string token, int index);

  private:
    // shorthand for the two lookup tables
    typedef std::map<std::string, std::size_t> IndexByString;
    typedef std::map<std::size_t, std::string> StringByIndex;

    IndexByString _tokens;                     // string -> index
    IndexByString::const_iterator _tokens_it;  // scratch iterator into _tokens
    std::size_t _nextindex;                    // next free index for new string
    StringByIndex _strings;                    // index -> string
    StringByIndex::const_iterator _strings_it; // scratch iterator into _strings
};
71
+
72
+ #endif
73
+
@@ -0,0 +1,150 @@
1
+ #include "tupleset.h"
2
+
3
+ /**
4
+ * This file is part of uhferret.
5
+ *
6
+ * Author:: Peter Lane
7
+ * Copyright:: Copyright 2011, Peter Lane.
8
+ * License:: GPLv3
9
+ *
10
+ * uhferret is free software: you can redistribute it and/or modify
11
+ * it under the terms of the GNU General Public License as published by
12
+ * the Free Software Foundation, either version 3 of the License, or
13
+ * (at your option) any later version.
14
+ *
15
+ * uhferret is distributed in the hope that it will be useful,
16
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
17
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18
+ * GNU General Public License for more details.
19
+ *
20
+ * You should have received a copy of the GNU General Public License
21
+ * along with uhferret. If not, see <http://www.gnu.org/licenses/>.
22
+ */
23
+
24
+ TupleSet::TupleSet ()
25
+ {}
26
+
27
+ void TupleSet::Clear ()
28
+ {
29
+ _tuple_map.clear ();
30
+ }
31
+
32
+ int TupleSet::Size ()
33
+ {
34
+ int trigram_count = 0;
35
+ for (Begin (); HasMore (); GetNext ())
36
+ {
37
+ trigram_count++;
38
+ }
39
+
40
+ return trigram_count;
41
+ }
42
+
43
+ std::vector<int> & TupleSet::GetDocumentsForTuple (std::size_t token_0, std::size_t token_1, std::size_t token_2)
44
+ {
45
+ return _tuple_map[token_0][token_1][token_2];
46
+ }
47
+
48
+ bool TupleSet::AddDocument (std::size_t token_0, std::size_t token_1, std::size_t token_2, int document)
49
+ {
50
+ bool has_doc = false;
51
+ std::vector<int> & fvector = _tuple_map[token_0][token_1][token_2];
52
+ // check if document is already in the trigram
53
+ for (int i = 0, n = fvector.size(); i < n; ++i)
54
+ {
55
+ if (fvector[i] == document)
56
+ {
57
+ has_doc = true;
58
+ break;
59
+ }
60
+ }
61
+
62
+ if (!has_doc) // didn't have document, so add it
63
+ {
64
+ fvector.push_back (document);
65
+ return true; // indicate that document added
66
+ }
67
+ return false;
68
+ }
69
+
70
+ bool TupleSet::IsMatchingTuple (std::size_t t0, std::size_t t1, std::size_t t2, int doc1, int doc2)
71
+ {
72
+ std::vector<int> fvector = GetDocumentsForTuple (t0, t1, t2);
73
+ bool has_doc1 = false;
74
+ bool has_doc2 = false;
75
+ for (int i=0, n=fvector.size(); i<n; ++i)
76
+ {
77
+ if (fvector[i] == doc1) has_doc1 = true;
78
+ if (fvector[i] == doc2) has_doc2 = true;
79
+ }
80
+ return ( has_doc1 && has_doc2 );
81
+ }
82
+
83
+ std::vector<std::string> TupleSet::CollectMatchingTuples (int doc1, int doc2, TokenSet & tokenset)
84
+ {
85
+ std::vector<std::string> tuples;
86
+ for (Begin (); HasMore (); GetNext ())
87
+ {
88
+ if (IsMatchingTuple (GetToken (0), GetToken (1), GetToken (2),
89
+ doc1, doc2))
90
+ {
91
+ tuples.push_back (GetStringForCurrentTuple (tokenset));
92
+ }
93
+ }
94
+ return tuples;
95
+ }
96
+
97
+ void TupleSet::Begin ()
98
+ {
99
+ _ti = _tuple_map.begin ();
100
+ _pi = (_ti->second).begin ();
101
+ _wi = (_pi->second).begin ();
102
+ }
103
+
104
+ void TupleSet::GetNext ()
105
+ {
106
+ _wi++; // move to next word position
107
+ if (_wi == (_pi->second).end ()) // if words have finished, then move to next pair position
108
+ {
109
+ _pi++;
110
+ if (_pi == (_ti->second).end ()) // if pairs have finished, then move to next triple position
111
+ {
112
+ _ti++;
113
+ if (_ti == _tuple_map.end ()) return; // finished
114
+ _pi = (_ti->second).begin (); // get next pair iterator
115
+ }
116
+ _wi = (_pi->second).begin (); // get next word iterator
117
+ }
118
+ }
119
+
120
+ bool TupleSet::HasMore () const
121
+ {
122
+ return _ti != _tuple_map.end ();
123
+ }
124
+
125
+ std::vector<int> & TupleSet::GetDocumentsForCurrentTuple ()
126
+ {
127
+ return GetDocumentsForTuple (_ti->first, _pi->first, _wi->first);
128
+ }
129
+
130
+ std::string TupleSet::GetStringForCurrentTuple (TokenSet & tokenset) const
131
+ {
132
+ std::string tuple = "";
133
+ tuple += tokenset.GetStringFor (_ti->first);
134
+ tuple += " " + tokenset.GetStringFor (_pi->first);
135
+ tuple += " " + tokenset.GetStringFor (_wi->first);
136
+
137
+ return tuple;
138
+
139
+ }
140
+
141
+ std::size_t TupleSet::GetToken (int i) const
142
+ {
143
+ assert (i>=0 && i<=2);
144
+ if (i == 0)
145
+ return _ti->first;
146
+ else if (i == 1)
147
+ return _pi->first;
148
+ else // if (i == 2)
149
+ return _wi->first;
150
+ }
@@ -0,0 +1,92 @@
1
+ #if !defined tupleset_h
2
+ #define tupleset_h
3
+
4
+ /**
5
+ * This file is part of uhferret.
6
+ * Initial triple map idea by Bob Dickerson.
7
+ *
8
+ * Author:: Peter Lane
9
+ * Copyright:: Copyright 2011, Peter Lane.
10
+ * License:: GPLv3
11
+ *
12
+ * uhferret is free software: you can redistribute it and/or modify
13
+ * it under the terms of the GNU General Public License as published by
14
+ * the Free Software Foundation, either version 3 of the License, or
15
+ * (at your option) any later version.
16
+ *
17
+ * uhferret is distributed in the hope that it will be useful,
18
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
19
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20
+ * GNU General Public License for more details.
21
+ *
22
+ * You should have received a copy of the GNU General Public License
23
+ * along with uhferret. If not, see <http://www.gnu.org/licenses/>.
24
+ */
25
+
26
+ #include <assert.h>
27
+ #include <map>
28
+ #include <string>
29
+ #include <vector>
30
+
31
+ #include "tokenset.h"
32
+
33
+ /** TupleSet maintains the database mapping trigrams to the identifiers of the documents which contain them.
34
+ * The mapping is held as a sequence of std::maps, each map taking a std::size_t reference to
35
+ * a token as a key. The end result of the three maps is a vector of document identifiers.
36
+ *
37
+ * The most important feature of the TupleSet is the collection of methods for iterating over
38
+ * all tuples in the TupleSet.
39
+ * e.g. with the definition: TupleSet tuple_set;
40
+ * use: for (tuple_set.Begin (); tuple_set.HasMore (); tuple_set.GetNext ())
41
+ * {}
42
+ * to iterate over all the tuples. The methods: GetDocumentsForCurrentTuple, GetStringForCurrentTuple,
43
+ * and GetToken (0), GetToken (1), GetToken (2) return information on the current tuple.
44
+ */
45
+ class TupleSet
46
+ {
47
+ // typedef's to simplify declarations
48
+ typedef std::map<std::size_t, std::vector<int> > WordMap;
49
+ typedef WordMap::const_iterator WordMapIter;
50
+
51
+ typedef std::map<std::size_t, WordMap> PairMap;
52
+ typedef PairMap::const_iterator PairMapIter;
53
+
54
+ typedef std::map<std::size_t, PairMap> TripMap;
55
+ typedef TripMap::const_iterator TripMapIter;
56
+
57
+ public:
58
+ TupleSet ();
59
+ void Clear ();
60
+ int Size ();
61
+ // given a tuple, return the list of documents which contain that tuple
62
+ std::vector<int> & GetDocumentsForTuple (std::size_t token_0,
63
+ std::size_t token_1, std::size_t token_2);
64
+ // given a tuple and a document identifier,
65
+ // - make sure that the document is in the list for that tuple
66
+ // - returns true if the document was not already in trigram's list
67
+ bool AddDocument (std::size_t token_0, std::size_t token_1, std::size_t token_2,
68
+ int document);
69
+ // check if two documents share the given tuple
70
+ bool IsMatchingTuple (std::size_t t0, std::size_t t1, std::size_t t2, int doc1, int doc2);
71
+ // collect and return all tuples in the two given documents
72
+ std::vector<std::string> CollectMatchingTuples (int doc1, int doc2, TokenSet & tokenset);
73
+ private:
74
+ TripMap _tuple_map;
75
+ public: // following methods and data structures are to handle an iterator on tupleset
76
+ void Begin (); // start the iterator
77
+ void GetNext (); // advance the iterator
78
+ bool HasMore () const; // check for end
79
+ // retrieve current tuple's documents
80
+ std::vector<int> & GetDocumentsForCurrentTuple ();
81
+ // retrieve string for current tuple
82
+ std::string GetStringForCurrentTuple (TokenSet & tokenset) const;
83
+ // retrieve identifiers for individual tokens
84
+ std::size_t GetToken (int i) const;
85
+ private:
86
+ TripMapIter _ti; // iterator from first token to pairs
87
+ PairMapIter _pi; // iterator from second token to words
88
+ WordMapIter _wi; // iterator from third token to document list
89
+ };
90
+
91
+ #endif
92
+