uhferret 1.3.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,85 @@
1
+ #if !defined tokenreader_h
2
+ #define tokenreader_h
3
+
4
+ /**
5
+ * This file is part of uhferret.
6
+ *
7
+ * Author:: Peter Lane
8
+ * Copyright:: Copyright 2011, Peter Lane.
9
+ * License:: GPLv3
10
+ *
11
+ * uhferret is free software: you can redistribute it and/or modify
12
+ * it under the terms of the GNU General Public License as published by
13
+ * the Free Software Foundation, either version 3 of the License, or
14
+ * (at your option) any later version.
15
+ *
16
+ * uhferret is distributed in the hope that it will be useful,
17
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
18
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19
+ * GNU General Public License for more details.
20
+ *
21
+ * You should have received a copy of the GNU General Public License
22
+ * along with uhferret. If not, see <http://www.gnu.org/licenses/>.
23
+ */
24
+
25
+ #include <ctype.h> // gives tests for if characters are numbers, alphanumerics, etc
26
+ #include <istream>
27
+
28
+ #include "tokenset.h"
29
+
30
+ /** The TokenReader is the parent class of the different 'token-isers'
31
+ * -- WordReader tokenises a document into strings of alphanumeric characters
32
+ * -- CCodeReader tokenises a document into symbols matching a C-style language
33
+ * The token reader is initialised with an input stream
34
+ * -- GetToken is used to 'walk through' the document, one token at a time
35
+ * until IsFinished returns true.
36
+ * -- the start and end points of the token can be retrieved using the given methods,
37
+ * and the string making up the token can be obtained by caller from the TokenSet
38
+ */
39
+ class TokenReader
40
+ {
41
+ public:
42
+ TokenReader (std::istream & input);
43
+ // return index of last read token
44
+ std::size_t GetToken (TokenSet & tokenset); // retrieve current token identifier
45
+ bool IsFinished () const; // return true if end-of-file reached
46
+ int GetTokenStart () const; // return the start position of current token
47
+ int GetTokenEnd () const; // return the end position of current token
48
+ // read token, return true if successful
49
+ // -- user of class must provide this method
50
+ virtual bool ReadToken () = 0;
51
+ protected: // allow subclasses to access parameters
52
+ std::istream & _input; // the stream from which to read
53
+ int _position; // current position in stream
54
+ Token _token; // last token read
55
+ int _token_start; // start position of last token read
56
+ char _look; // lookahead character
57
+ bool _done; // becomes true when stream is completed
58
+ };
59
+
60
+ // The WordReader separates its input stream into tokens, consisting of
61
+ // consecutive alphabetic characters
62
+ // -- every character is converted to lower case
63
+ class WordReader: public TokenReader
64
+ {
65
+ public:
66
+ WordReader (std::istream & input) : TokenReader (input) {}
67
+ bool IsAlphabetChar (char ch);
68
+ bool IsSingleCharWord (char ch);
69
+ bool ReadToken ();
70
+ };
71
+
72
+ // The CCodeReader separates its input stream into tokens, looking for
73
+ // C-style tokens, numbers and symbols
74
+ class CCodeReader: public TokenReader
75
+ {
76
+ public:
77
+ CCodeReader (std::istream & input) : TokenReader (input) {}
78
+ bool ReadToken ();
79
+ private:
80
+ bool IsSymbol (char c);
81
+ bool IsSymbol (std::string token, char c);
82
+ };
83
+
84
+ #endif
85
+
@@ -0,0 +1,111 @@
1
+ #include "tokenset.h"
2
+
3
+ /**
4
+ * This file is part of uhferret.
5
+ *
6
+ * Author:: Peter Lane
7
+ * Copyright:: Copyright 2011, Peter Lane.
8
+ * License:: GPLv3
9
+ *
10
+ * uhferret is free software: you can redistribute it and/or modify
11
+ * it under the terms of the GNU General Public License as published by
12
+ * the Free Software Foundation, either version 3 of the License, or
13
+ * (at your option) any later version.
14
+ *
15
+ * uhferret is distributed in the hope that it will be useful,
16
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
17
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18
+ * GNU General Public License for more details.
19
+ *
20
+ * You should have received a copy of the GNU General Public License
21
+ * along with uhferret. If not, see <http://www.gnu.org/licenses/>.
22
+ */
23
+
24
+ Token::Token ()
25
+ : _capacity (2), _top (0)
26
+ {
27
+ _token = new char [_capacity];
28
+ }
29
+
30
+ Token::~Token ()
31
+ {
32
+ delete[] _token;
33
+ }
34
+
35
+ void Token::Erase ()
36
+ {
37
+ _top = 0;
38
+ }
39
+
40
+ void Token::AddChar (char c)
41
+ {
42
+ if (_top == _capacity)
43
+ Grow ();
44
+ _token [_top] = c;
45
+ ++_top;
46
+ }
47
+
48
+ std::string Token::GetString () const
49
+ {
50
+ return std::string (_token, _top);
51
+ }
52
+
53
+ int Token::GetLength () const
54
+ {
55
+ return _top;
56
+ }
57
+
58
+ void Token::Grow ()
59
+ {
60
+ char * newtoken = new char [2 * _capacity];
61
+ for (int i = 0; i < _capacity; ++i)
62
+ newtoken[i] = _token[i];
63
+ _capacity = 2 * _capacity;
64
+ delete _token;
65
+ _token = newtoken;
66
+ }
67
+
68
+ // *** TokenSet
69
+
70
+ TokenSet::TokenSet ()
71
+ : _nextindex (0)
72
+ {}
73
+
74
+ std::size_t TokenSet::GetIndexFor (std::string token)
75
+ {
76
+ _tokens_it = _tokens.find (token);
77
+ if (_tokens_it != _tokens.end()) // found it
78
+ return _tokens_it->second;
79
+ else // otherwise, make a new index
80
+ {
81
+ _tokens[token] = _nextindex;
82
+ _strings[_nextindex] = token;
83
+ _nextindex++;
84
+ return _nextindex-1;
85
+ }
86
+ }
87
+
88
+ std::string TokenSet::GetStringFor (std::size_t token)
89
+ {
90
+ _strings_it = _strings.find (token);
91
+ assert (_strings_it != _strings.end ()); // it's an error if token not in token set
92
+ return std::string (_strings_it->second.c_str ());
93
+ }
94
+
95
+ void TokenSet::Clear ()
96
+ {
97
+ _tokens.clear ();
98
+ _strings.clear ();
99
+ _nextindex = 0;
100
+ }
101
+
102
+ void TokenSet::SetNextIndex (int index)
103
+ {
104
+ _nextindex = index;
105
+ }
106
+
107
+ void TokenSet::SetIndexString (std::string token, int index)
108
+ {
109
+ _strings[index] = token;
110
+ _tokens[token] = index;
111
+ }
@@ -0,0 +1,73 @@
1
+ #if !defined tokenset_h
2
+ #define tokenset_h
3
+
4
+ /**
5
+ * This file is part of uhferret.
6
+ *
7
+ * Author:: Peter Lane
8
+ * Copyright:: Copyright 2011, Peter Lane.
9
+ * License:: GPLv3
10
+ *
11
+ * uhferret is free software: you can redistribute it and/or modify
12
+ * it under the terms of the GNU General Public License as published by
13
+ * the Free Software Foundation, either version 3 of the License, or
14
+ * (at your option) any later version.
15
+ *
16
+ * uhferret is distributed in the hope that it will be useful,
17
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
18
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19
+ * GNU General Public License for more details.
20
+ *
21
+ * You should have received a copy of the GNU General Public License
22
+ * along with uhferret. If not, see <http://www.gnu.org/licenses/>.
23
+ */
24
+
25
+ #include <assert.h>
26
+ #include <map>
27
+ #include <string>
28
+ #include <vector>
29
+
30
+ /** A Token is a sequence of characters read in by a TokenReader
31
+ * -- this class provides a dynamic storage for the token supporting
32
+ * addition of characters
33
+ * -- when finished, the token can be queried for its length and made into a string
34
+ */
35
class Token
{
  public:
    Token ();
    // copying makes a separate buffer for the new token
    //  -- the class owns a raw array, so the compiler-generated copy would
    //     leave two tokens deleting the same buffer
    Token (const Token & other)
      : _token (new char [other._capacity]),
        _capacity (other._capacity),
        _top (other._top)
    {
      for (int i = 0; i < _top; ++i)
        _token[i] = other._token[i];
    }
    // assignment copies into a new buffer before releasing the old one,
    // so the token is unchanged if allocation fails
    Token & operator= (const Token & other)
    {
      if (this != &other)
      {
        char * copy = new char [other._capacity];
        for (int i = 0; i < other._top; ++i)
          copy[i] = other._token[i];
        delete[] _token;
        _token = copy;
        _capacity = other._capacity;
        _top = other._top;
      }
      return *this;
    }
    ~Token ();
    void Erase ();                  // empty the token, keeping its storage
    void AddChar (char c);          // add a character to the end of the token
    std::string GetString () const; // the token's characters, as a string
    int GetLength () const;         // number of characters in the token
  private:
    void Grow ();                   // enlarge the storage
    char * _token;                  // storage for the token
    int _capacity;                  // size of the storage
    int _top;                       // pointer to end of token
};
50
+
51
+ /** A TokenSet maps strings to token indices
52
+ * -- this is for memory efficiency, ensuring every token's string is
53
+ * stored once within the application
54
+ */
55
class TokenSet
{
  public:
    TokenSet ();
    // return the index for a string, giving a new string the next free index
    std::size_t GetIndexFor (std::string token);
    // return the string stored for an index; the index must be in the set
    std::string GetStringFor (std::size_t token);
    // remove all strings, and restart indices from 0
    void Clear ();
    // set the index to be used for the next new string
    void SetNextIndex (int index);
    // store the given string under the given index
    void SetIndexString (std::string token, int index);
  private:
    // the two directions of the mapping
    typedef std::map<std::string, std::size_t> IndexByString;
    typedef std::map<std::size_t, std::string> StringByIndex;

    IndexByString _tokens;                      // string -> index
    IndexByString::const_iterator _tokens_it;   // result of last lookup in _tokens
    std::size_t _nextindex;                     // next free index for new string
    StringByIndex _strings;                     // index -> string
    StringByIndex::const_iterator _strings_it;  // result of last lookup in _strings
};
71
+
72
+ #endif
73
+
@@ -0,0 +1,150 @@
1
+ #include "tupleset.h"
2
+
3
+ /**
4
+ * This file is part of uhferret.
5
+ *
6
+ * Author:: Peter Lane
7
+ * Copyright:: Copyright 2011, Peter Lane.
8
+ * License:: GPLv3
9
+ *
10
+ * uhferret is free software: you can redistribute it and/or modify
11
+ * it under the terms of the GNU General Public License as published by
12
+ * the Free Software Foundation, either version 3 of the License, or
13
+ * (at your option) any later version.
14
+ *
15
+ * uhferret is distributed in the hope that it will be useful,
16
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
17
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18
+ * GNU General Public License for more details.
19
+ *
20
+ * You should have received a copy of the GNU General Public License
21
+ * along with uhferret. If not, see <http://www.gnu.org/licenses/>.
22
+ */
23
+
24
+ TupleSet::TupleSet ()
25
+ {}
26
+
27
+ void TupleSet::Clear ()
28
+ {
29
+ _tuple_map.clear ();
30
+ }
31
+
32
+ int TupleSet::Size ()
33
+ {
34
+ int trigram_count = 0;
35
+ for (Begin (); HasMore (); GetNext ())
36
+ {
37
+ trigram_count++;
38
+ }
39
+
40
+ return trigram_count;
41
+ }
42
+
43
+ std::vector<int> & TupleSet::GetDocumentsForTuple (std::size_t token_0, std::size_t token_1, std::size_t token_2)
44
+ {
45
+ return _tuple_map[token_0][token_1][token_2];
46
+ }
47
+
48
+ bool TupleSet::AddDocument (std::size_t token_0, std::size_t token_1, std::size_t token_2, int document)
49
+ {
50
+ bool has_doc = false;
51
+ std::vector<int> & fvector = _tuple_map[token_0][token_1][token_2];
52
+ // check if document is already in the trigram
53
+ for (int i = 0, n = fvector.size(); i < n; ++i)
54
+ {
55
+ if (fvector[i] == document)
56
+ {
57
+ has_doc = true;
58
+ break;
59
+ }
60
+ }
61
+
62
+ if (!has_doc) // didn't have document, so add it
63
+ {
64
+ fvector.push_back (document);
65
+ return true; // indicate that document added
66
+ }
67
+ return false;
68
+ }
69
+
70
+ bool TupleSet::IsMatchingTuple (std::size_t t0, std::size_t t1, std::size_t t2, int doc1, int doc2)
71
+ {
72
+ std::vector<int> fvector = GetDocumentsForTuple (t0, t1, t2);
73
+ bool has_doc1 = false;
74
+ bool has_doc2 = false;
75
+ for (int i=0, n=fvector.size(); i<n; ++i)
76
+ {
77
+ if (fvector[i] == doc1) has_doc1 = true;
78
+ if (fvector[i] == doc2) has_doc2 = true;
79
+ }
80
+ return ( has_doc1 && has_doc2 );
81
+ }
82
+
83
+ std::vector<std::string> TupleSet::CollectMatchingTuples (int doc1, int doc2, TokenSet & tokenset)
84
+ {
85
+ std::vector<std::string> tuples;
86
+ for (Begin (); HasMore (); GetNext ())
87
+ {
88
+ if (IsMatchingTuple (GetToken (0), GetToken (1), GetToken (2),
89
+ doc1, doc2))
90
+ {
91
+ tuples.push_back (GetStringForCurrentTuple (tokenset));
92
+ }
93
+ }
94
+ return tuples;
95
+ }
96
+
97
+ void TupleSet::Begin ()
98
+ {
99
+ _ti = _tuple_map.begin ();
100
+ _pi = (_ti->second).begin ();
101
+ _wi = (_pi->second).begin ();
102
+ }
103
+
104
+ void TupleSet::GetNext ()
105
+ {
106
+ _wi++; // move to next word position
107
+ if (_wi == (_pi->second).end ()) // if words have finished, then move to next pair position
108
+ {
109
+ _pi++;
110
+ if (_pi == (_ti->second).end ()) // if pairs have finished, then move to next triple position
111
+ {
112
+ _ti++;
113
+ if (_ti == _tuple_map.end ()) return; // finished
114
+ _pi = (_ti->second).begin (); // get next pair iterator
115
+ }
116
+ _wi = (_pi->second).begin (); // get next word iterator
117
+ }
118
+ }
119
+
120
+ bool TupleSet::HasMore () const
121
+ {
122
+ return _ti != _tuple_map.end ();
123
+ }
124
+
125
+ std::vector<int> & TupleSet::GetDocumentsForCurrentTuple ()
126
+ {
127
+ return GetDocumentsForTuple (_ti->first, _pi->first, _wi->first);
128
+ }
129
+
130
+ std::string TupleSet::GetStringForCurrentTuple (TokenSet & tokenset) const
131
+ {
132
+ std::string tuple = "";
133
+ tuple += tokenset.GetStringFor (_ti->first);
134
+ tuple += " " + tokenset.GetStringFor (_pi->first);
135
+ tuple += " " + tokenset.GetStringFor (_wi->first);
136
+
137
+ return tuple;
138
+
139
+ }
140
+
141
+ std::size_t TupleSet::GetToken (int i) const
142
+ {
143
+ assert (i>=0 && i<=2);
144
+ if (i == 0)
145
+ return _ti->first;
146
+ else if (i == 1)
147
+ return _pi->first;
148
+ else // if (i == 2)
149
+ return _wi->first;
150
+ }
@@ -0,0 +1,92 @@
1
+ #if !defined tupleset_h
2
+ #define tupleset_h
3
+
4
+ /**
5
+ * This file is part of uhferret.
6
+ * Initial triple map idea by Bob Dickerson.
7
+ *
8
+ * Author:: Peter Lane
9
+ * Copyright:: Copyright 2011, Peter Lane.
10
+ * License:: GPLv3
11
+ *
12
+ * uhferret is free software: you can redistribute it and/or modify
13
+ * it under the terms of the GNU General Public License as published by
14
+ * the Free Software Foundation, either version 3 of the License, or
15
+ * (at your option) any later version.
16
+ *
17
+ * uhferret is distributed in the hope that it will be useful,
18
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
19
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20
+ * GNU General Public License for more details.
21
+ *
22
+ * You should have received a copy of the GNU General Public License
23
+ * along with uhferret. If not, see <http://www.gnu.org/licenses/>.
24
+ */
25
+
26
+ #include <assert.h>
27
+ #include <map>
28
+ #include <string>
29
+ #include <vector>
30
+
31
+ #include "tokenset.h"
32
+
33
+ /** TupleSet maintains the database mapping trigrams to identifier of documents which contain them.
34
+ * The mapping is held as a sequence of std::maps, each map taking a std::size_t reference to
35
+ * a token as a key. The end result of the three maps is a vector of document identifiers.
36
+ *
37
+ * The most important feature of the TupleSet is the collection of methods for iterating over
38
+ * all tuples in the TupleSet.
39
+ * e.g. with the definition: TupleSet tuple_set;
40
+ * use: for (tuple_set.Begin (); tuple_set.HasMore (); tuple_set.GetNext ())
41
+ * {}
42
+ * to iterate over all the tuples. The methods: GetDocumentsForCurrentTuple, GetStringForCurrentTuple,
43
+ * and GetToken (0), GetToken (1), GetToken (2) return information on the current tuple.
44
+ */
45
+ class TupleSet
46
+ {
47
+ // typedef's to simplify declarations
48
+ typedef std::map<std::size_t, std::vector<int> > WordMap;
49
+ typedef WordMap::const_iterator WordMapIter;
50
+
51
+ typedef std::map<std::size_t, WordMap> PairMap;
52
+ typedef PairMap::const_iterator PairMapIter;
53
+
54
+ typedef std::map<std::size_t, PairMap> TripMap;
55
+ typedef TripMap::const_iterator TripMapIter;
56
+
57
+ public:
58
+ TupleSet ();
59
+ void Clear ();
60
+ int Size ();
61
+ // given a tuple, return the list of documents which contain that tuple
62
+ std::vector<int> & GetDocumentsForTuple (std::size_t token_0,
63
+ std::size_t token_1, std::size_t token_2);
64
+ // given a tuple and a document identifier,
65
+ // - make sure that the document is in the list for that tuple
66
+ // - returns true if the document was not already in trigram's list
67
+ bool AddDocument (std::size_t token_0, std::size_t token_1, std::size_t token_2,
68
+ int document);
69
+ // check if two documents share the given tuple
70
+ bool IsMatchingTuple (std::size_t t0, std::size_t t1, std::size_t t2, int doc1, int doc2);
71
+ // collect and return all tuples in the two given documents
72
+ std::vector<std::string> CollectMatchingTuples (int doc1, int doc2, TokenSet & tokenset);
73
+ private:
74
+ TripMap _tuple_map;
75
+ public: // following methods and data structures are to handle an iterator on tupleset
76
+ void Begin (); // start the iterator
77
+ void GetNext (); // advance the iterator
78
+ bool HasMore () const; // check for end
79
+ // retrieve current tuple's documents
80
+ std::vector<int> & GetDocumentsForCurrentTuple ();
81
+ // retrieve string for current tuple
82
+ std::string GetStringForCurrentTuple (TokenSet & tokenset) const;
83
+ // retrieve identifiers for individual tokens
84
+ std::size_t GetToken (int i) const;
85
+ private:
86
+ TripMapIter _ti; // iterator from first token to pairs
87
+ PairMapIter _pi; // iterator from second token to words
88
+ WordMapIter _wi; // iterator from third token to document list
89
+ };
90
+
91
+ #endif
92
+