uhferret 1.3.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,89 @@
1
+ #if !defined document_h
2
+ #define document_h
3
+
4
+ /**
5
+ * This file is part of uhferret.
6
+ *
7
+ * Author:: Peter Lane
8
+ * Copyright:: Copyright 2011, Peter Lane.
9
+ * License:: GPLv3
10
+ *
11
+ * uhferret is free software: you can redistribute it and/or modify
12
+ * it under the terms of the GNU General Public License as published by
13
+ * the Free Software Foundation, either version 3 of the License, or
14
+ * (at your option) any later version.
15
+ *
16
+ * uhferret is distributed in the hope that it will be useful,
17
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
18
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19
+ * GNU General Public License for more details.
20
+ *
21
+ * You should have received a copy of the GNU General Public License
22
+ * along with uhferret. If not, see <http://www.gnu.org/licenses/>.
23
+ */
24
+
25
+ #include <cctype>
26
+ #include <fstream>
27
+ #include <istream>
28
+ #include <string>
29
+
30
+ #include "tokenset.h"
31
+ #include "tokenreader.h"
32
+
33
+ /** Document represents a single document held on the local filestore.
34
+ * -- each Document is initialised with a pathname and a document type,
35
+ * or it may take these values from another Document
36
+ * -- the group_id is used to place documents into groups: documents with the same id
37
+ * will not be compared against each other
38
+ * -- Document holds a pointer to a TokenReader, which is created on the heap during initialisation
39
+ * -- the important part of the class is the set of methods for iterating
40
+ * across the trigrams, using ReadTrigram, GetTrigramStart/End and GetToken
41
+ */
42
+ class Document
43
+ {
44
+ public:
45
+ enum DocumentType { TypeText, TypeCode }; // the two kinds of document that can be declared
46
+ Document (std::string pathname, DocumentType type = TypeText, int id = 0); // type defaults to TypeText, id to 0
47
+ Document (Document * document); // takes its values from the given Document
48
+ // basic information about the document: its type and pathname
49
+ void SetType (DocumentType type);
50
+ bool IsTextType () const;
51
+ std::string GetPathname () const;
52
+ void SetPathname (std::string pathname);
53
+ // accessor/setter for group_id (documents sharing an id are not compared)
54
+ int GetGroupId () const;
55
+ void SetGroupId (int id);
56
+ // information about the number of trigrams counted for this document
57
+ int GetTrigramCount () const;
58
+ void SetTrigramCount (int count);
59
+ void ResetTrigramCount ();
60
+ void IncrementTrigramCount ();
61
+ // following methods are used to start, read and end processing of trigrams
62
+ bool StartInput (TokenSet & tokenset); // presumably false when input could not be started -- TODO confirm
63
+ bool StartInput (std::istream & input, TokenSet & tokenset); // variant taking a caller-supplied stream
64
+ bool ReadTrigram (TokenSet & tokenset); // presumably true while another trigram was read -- TODO confirm
65
+ std::size_t GetToken (int i) const; // access token of current trigram
66
+ std::size_t GetTrigramStart () const; // access start position of trigram
67
+ std::size_t GetTrigramStart (int i) const; // access start of token i in trigram
68
+ std::size_t GetTrigramEnd () const; // access end position of trigram
69
+ void CloseInput ();
70
+ // following methods check the type of the document based on its filename
71
+ bool IsCodeType () const;
72
+ bool IsTxtType () const;
73
+ bool IsUnknownType () const;
74
+ private:
75
+ bool IsFileType (std::string extension) const; // presumably tests the pathname's extension -- TODO confirm
76
+ std::string StringToUpper (std::string) const;
77
+ void InitialiseInput (TokenSet & tokenset);
78
+ std::string _pathname; // -- source for this document
79
+ DocumentType _type;
80
+ int _num_trigrams; // presumably the counter behind the *TrigramCount methods -- TODO confirm
81
+ std::ifstream * _fb; // NOTE(review): raw pointer; who allocates and frees it is not visible in this header
82
+ std::istream * _cin; // NOTE(review): raw pointer; ownership not visible in this header
83
+ TokenReader * _token_input; // this is a pointer, because initialised separately
84
+ std::size_t _current_tuple[3]; // presumably the three tokens of the current trigram (see GetToken) -- TODO confirm
85
+ std::size_t _current_start[3]; // presumably their start positions (see GetTrigramStart (int)) -- TODO confirm
86
+ int _group_id; // an index number indicating this document's group
87
+ };
88
+
89
+ #endif
@@ -0,0 +1,229 @@
1
+ #include "documentlist.h"
2
+
3
+ /**
4
+ * This file is part of uhferret.
5
+ *
6
+ * Author:: Peter Lane
7
+ * Copyright:: Copyright 2011, Peter Lane.
8
+ * License:: GPLv3
9
+ *
10
+ * uhferret is free software: you can redistribute it and/or modify
11
+ * it under the terms of the GNU General Public License as published by
12
+ * the Free Software Foundation, either version 3 of the License, or
13
+ * (at your option) any later version.
14
+ *
15
+ * uhferret is distributed in the hope that it will be useful,
16
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
17
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18
+ * GNU General Public License for more details.
19
+ *
20
+ * You should have received a copy of the GNU General Public License
21
+ * along with uhferret. If not, see <http://www.gnu.org/licenses/>.
22
+ */
23
+ // Start with an empty list; no group id has been handed out yet.
24
+ DocumentList::DocumentList ()
25
+ : _last_group_id (0)
26
+ {}
27
+ // Destruction releases every Document held by the list.
28
+ DocumentList::~DocumentList ()
29
+ {
30
+ this->Clear (); // deletes the documents and resets the token/tuple/match data
31
+ }
32
+ // Add a document under a freshly generated group id.
33
+ void DocumentList::AddDocument (std::string pathname, Document::DocumentType type)
34
+ {
35
+ AddDocument (pathname, type, GetNewGroupId ()); // delegate to the explicit-id overload
36
+ }
37
+ // Add a document carrying the caller's group id; the new Document is heap-allocated and kept in the list.
38
+ void DocumentList::AddDocument (std::string pathname, Document::DocumentType type, int id)
39
+ {
40
+ this->_documents.push_back (new Document (pathname, type, id));
41
+ }
42
+ // Fetch the document stored at index i (asserted to be within the list).
43
+ Document * DocumentList::getDocument (std::size_t i) const
44
+ {
45
+ assert (i < _documents.size ()); // i is unsigned, so no lower-bound test is needed
46
+ return this->_documents[i];
47
+ }
48
+ // Unlink doc from the list (first match only). NOTE(review): the Document is not deleted here -- confirm the caller frees it.
49
+ void DocumentList::RemoveDocument (Document * doc)
50
+ {
51
+ std::vector<Document *>::iterator pos = _documents.begin ();
52
+ while (pos != _documents.end ())
53
+ {
54
+ if (*pos == doc)
55
+ {
56
+ _documents.erase (pos); // pos is invalid after this, so leave immediately
57
+ return;
58
+ }
59
+ ++pos;
60
+ }
61
+ }
62
+ // Give callers direct (mutable) access to the list's token set.
63
+ TokenSet & DocumentList::GetTokenSet ()
64
+ {
65
+ return this->_token_set;
66
+ }
67
+ // Give callers direct (mutable) access to the list's tuple set.
68
+ TupleSet & DocumentList::GetTupleSet ()
69
+ {
70
+ return this->_tuple_set;
71
+ }
72
+ // Empty the list. The list owns its documents, so each one is deleted
73
+ // before the vector is emptied and the token/tuple/match data is reset.
74
+ void DocumentList::Clear ()
75
+ {
76
+ for (std::vector<Document *>::iterator it = _documents.begin (); it != _documents.end (); ++it)
77
+ {
78
+ delete *it;
79
+ }
80
+ _documents.clear ();
81
+ ResetReading ();
82
+ }
83
+ // Hand out the next group id: one more than the last id returned.
84
+ // Ids produced here therefore start at 1;
85
+ // note that id = 0 has special meaning (files downloaded from web)
86
+ int DocumentList::GetNewGroupId ()
87
+ {
88
+ // pre-increment so the value returned is the one just recorded
89
+ return ++_last_group_id;
90
+ }
91
+ // Discard everything derived from reading; the documents themselves stay in the list.
92
+ void DocumentList::ResetReading ()
93
+ {
94
+ this->_token_set.Clear (); // known tokens
95
+ this->_tuple_set.Clear (); // trigram-to-documents data
96
+ this->_matches.clear (); // pairwise match counts
97
+ }
98
+ // Number of documents currently in the list, as an int.
99
+ int DocumentList::Size () const
100
+ {
101
+ return static_cast<int> (_documents.size ());
102
+ }
103
+ // Count the unordered document pairs whose group ids differ, i.e.
104
+ // don't count pairs of documents in same group
105
+ int DocumentList::NumberOfPairs () const
106
+ {
107
+ int pair_total = 0;
108
+ for (std::size_t a = 0, n = _documents.size (); a < n; ++a)
109
+ for (std::size_t b = a + 1; b < n; ++b)
110
+ {
111
+ const bool same_group = (_documents[a]->GetGroupId () == _documents[b]->GetGroupId ());
112
+ if (!same_group) ++pair_total;
113
+ }
114
+ return pair_total;
115
+ }
116
+ // Run the whole comparison, reading documents from index first_document onwards.
117
+ void DocumentList::RunFerret (int first_document)
118
+ {
119
+ ResetReading (); // drop tokens, tuples and match counts from any earlier run
120
+ // phase 1 -- read each file in turn, finding trigrams
121
+ int index = first_document;
122
+ while (static_cast<std::size_t> (index) < _documents.size ()) // unsigned comparison, as before
123
+ {
124
+ ReadDocument (index++);
125
+ }
126
+ // phase 2 -- compute the similarities
127
+ ComputeSimilarities ();
128
+ }
129
+ // Read document i, recording each of its trigrams against index i in the tuple set.
130
+ void DocumentList::ReadDocument (int i)
131
+ {
132
+ Document * doc = _documents[i];
133
+ // StartInput's result used to be ignored -- presumably false means the input could not be started
134
+ const bool started = doc->StartInput (_token_set);
135
+ doc->ResetTrigramCount ();
136
+ while (started && doc->ReadTrigram (_token_set)) // never read from input that failed to start
137
+ {
138
+ // count the trigram only when the tuple set returns true (presumably: newly recorded for this document)
139
+ if (_tuple_set.AddDocument (doc->GetToken (0), doc->GetToken (1), doc->GetToken (2), i))
140
+ {
141
+ doc->IncrementTrigramCount ();
142
+ }
143
+ }
144
+ // CloseInput is still called when StartInput failed, exactly as the previous code did
145
+ doc->CloseInput ();
146
+ }
147
+ // Reset the match table to n*n zero entries, where n is the number of documents.
148
+ void DocumentList::ClearSimilarities ()
149
+ {
150
+ const std::size_t n = _documents.size ();
151
+ // assign replaces any earlier contents; the old push_back loop appended, so a
152
+ // second call without ResetReading left stale counts in front of the new zeros
153
+ _matches.assign (n * n, 0);
154
+ }
155
+ // Fill the match table: entry [a * n + b] with a < b counts the trigrams found in both documents a and b.
156
+ void DocumentList::ComputeSimilarities ()
157
+ {
158
+ ClearSimilarities ();
159
+ for (_tuple_set.Begin (); _tuple_set.HasMore (); _tuple_set.GetNext ())
160
+ {
161
+ const std::vector<int> & docs = _tuple_set.GetDocumentsForCurrentTuple ();
162
+ // take each pair of documents in the vector, and add one to matches
163
+ for (std::size_t fi = 0, n = docs.size (); fi < n; ++fi)
164
+ {
165
+ for (std::size_t fj = fi + 1; fj < n; ++fj)
166
+ {
167
+ // order by document index; comparing the positions fi and fj (always fi < fj) ordered nothing
168
+ const int doc1 = (docs[fi] <= docs[fj] ? docs[fi] : docs[fj]);
169
+ const int doc2 = (docs[fi] <= docs[fj] ? docs[fj] : docs[fi]);
170
+ assert (doc1 * _documents.size () + doc2 < _matches.size ());
171
+ _matches[doc1 * _documents.size () + doc2]++;
172
+ }
173
+ }
174
+ }
175
+ }
176
+ // Report the tuple set's Size () -- the trigram total across the documents read.
177
+ int DocumentList::GetTotalTrigramCount ()
178
+ {
179
+ return this->_tuple_set.Size ();
180
+ }
181
+ // Trigram count held by the document at index doc_i (the index is not range-checked).
182
+ int DocumentList::CountTrigrams (int doc_i)
183
+ {
184
+ return this->_documents[doc_i]->GetTrigramCount ();
185
+ }
186
+ // Matching-trigram count for a pair of documents; _matches is only completed from one side, with doc_j > doc_i.
187
+ int DocumentList::CountMatches (int doc_i, int doc_j)
188
+ {
189
+ const std::size_t slot = doc_i * _documents.size () + doc_j; // row doc_i, column doc_j
190
+ assert (doc_j > doc_i && slot < _matches.size ());
191
+ return _matches[slot];
192
+ }
193
+ // Resemblance: matching trigrams / (trigrams of doc_i + trigrams of doc_j - matching); 0 when that total is 0.
194
+ float DocumentList::ComputeResemblance (int doc_i, int doc_j)
195
+ {
196
+ // resemblance is symmetric but CountMatches needs the smaller index first, so order the pair here
197
+ const int shared = (doc_j > doc_i ? CountMatches (doc_i, doc_j) : CountMatches (doc_j, doc_i));
198
+ const float total_trigrams = static_cast<float> (CountTrigrams (doc_i) + CountTrigrams (doc_j) - shared);
199
+ return (total_trigrams == 0.0 ? 0.0f : static_cast<float> (shared) / total_trigrams); // check for divide by zero
200
+ }
201
+ // Containment: matching trigrams divided by the trigram count of doc_j; 0 when doc_j has none.
202
+ float DocumentList::ComputeContainment (int doc_i, int doc_j)
203
+ {
204
+ const int lower = (doc_j > doc_i ? doc_i : doc_j), upper = (doc_j > doc_i ? doc_j : doc_i);
205
+ const float shared = static_cast<float> (CountMatches (lower, upper)); // smaller index goes first
206
+ const float target_trigrams = static_cast<float> (CountTrigrams (doc_j));
207
+ return (target_trigrams == 0.0 ? 0.0f : shared / target_trigrams); // check for divide by zero
208
+ }
209
+ // Ask the tuple set whether trigram (t0, t1, t2) is a matching tuple for doc1 and doc2.
210
+ bool DocumentList::IsMatchingTrigram (std::size_t t0, std::size_t t1, std::size_t t2, int doc1, int doc2)
211
+ {
212
+ return this->_tuple_set.IsMatchingTuple (t0, t1, t2, doc1, doc2);
213
+ }
214
+ // Render a trigram as text: the three token strings joined by single spaces.
215
+ std::string DocumentList::MakeTrigramString (std::size_t t0, std::size_t t1, std::size_t t2)
216
+ {
217
+ std::string text (_token_set.GetStringFor (t0));
218
+ text.append (" ");
219
+ text.append (_token_set.GetStringFor (t1));
220
+ text.append (" ");
221
+ text.append (_token_set.GetStringFor (t2));
222
+ return text;
223
+ }
224
+ // Collect the matching tuples of doc1 and doc2 as strings, using the token set to spell them.
225
+ std::vector<std::string> DocumentList::CollectMatchingTrigrams (int doc1, int doc2)
226
+ {
227
+ return this->_tuple_set.CollectMatchingTuples (doc1, doc2, _token_set);
228
+ }
229
+
@@ -0,0 +1,80 @@
1
+ #if !defined documentlist_h
2
+ #define documentlist_h
3
+
4
+ /**
5
+ * This file is part of uhferret.
6
+ *
7
+ * Author:: Peter Lane
8
+ * Copyright:: Copyright 2011, Peter Lane.
9
+ * License:: GPLv3
10
+ *
11
+ * uhferret is free software: you can redistribute it and/or modify
12
+ * it under the terms of the GNU General Public License as published by
13
+ * the Free Software Foundation, either version 3 of the License, or
14
+ * (at your option) any later version.
15
+ *
16
+ * uhferret is distributed in the hope that it will be useful,
17
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
18
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19
+ * GNU General Public License for more details.
20
+ *
21
+ * You should have received a copy of the GNU General Public License
22
+ * along with uhferret. If not, see <http://www.gnu.org/licenses/>.
23
+ */
24
+
25
+ #include <assert.h>
26
+ #include <vector>
27
+
28
+ #include "tokenset.h"
29
+ #include "tupleset.h"
30
+ #include "document.h"
31
+
32
+ /** DocumentList maintains a list of documents, a TokenSet of identified Tokens and
33
+ * a TupleSet, which maps from sequences of three tokens to lists of documents
34
+ * in which the trigrams were found.
35
+ * -- Methods are provided to calculate information about pairs of documents,
36
+ * such as Resemblance and Containment.
37
+ * -- Note that the Documents are created by AddDocument and owned by this class,
38
+ * and hence all Documents are destroyed with the DocumentList.
39
+ */
40
+ class DocumentList
41
+ {
42
+ public:
43
+ DocumentList ();
44
+ ~DocumentList (); // calls Clear (), which deletes the documents
45
+ void AddDocument (std::string pathname, Document::DocumentType type = Document::TypeText); // uses a new group id
46
+ void AddDocument (std::string pathname, Document::DocumentType type, int id); // uses the given group id
47
+ Document * getDocument (std::size_t i) const; // i must be a valid index (asserted)
48
+ void RemoveDocument (Document * doc); // removes the pointer from the list; does not delete the Document
49
+ TokenSet & GetTokenSet ();
50
+ TupleSet & GetTupleSet ();
51
+ void Clear (); // deletes all documents, then calls ResetReading ()
52
+ int GetNewGroupId (); // increments and returns the last group id
53
+ void ResetReading (); // clears the token set, tuple set and match counts
54
+ int Size () const; // number of documents
55
+ int NumberOfPairs () const; // pairs of documents whose group ids differ
56
+ void RunFerret (int first_document = 0); // reset, read documents from first_document on, compute similarities
57
+ void ReadDocument (int i); // read the trigrams of document i into the tuple set
58
+ void ClearSimilarities (); // prepares _matches (n*n zero entries) before counting
59
+ void ComputeSimilarities (); // counts, per pair of documents, the tuples listing both
60
+ int GetTotalTrigramCount (); // size of the tuple set
61
+ int CountTrigrams (int doc_i); // trigram count stored in document doc_i
62
+ int CountMatches (int doc_i, int doc_j); // requires doc_j > doc_i (asserted)
63
+ float ComputeResemblance (int doc_i, int doc_j); // matches / (trigrams of both - matches)
64
+ float ComputeContainment (int doc_i, int doc_j); // matches / trigrams of doc_j
65
+ // check if given trigram is in both the indexed documents
66
+ bool IsMatchingTrigram (std::size_t t0, std::size_t t1, std::size_t t2, int doc1, int doc2);
67
+ // convert given trigram into a string (token strings separated by spaces)
68
+ std::string MakeTrigramString (std::size_t t0, std::size_t t1, std::size_t t2);
69
+ // collect all the matching trigrams in the two documents into a vector of strings
70
+ std::vector<std::string> CollectMatchingTrigrams (int doc1, int doc2);
71
+ private:
72
+ std::vector<Document *> _documents; // owned: deleted in Clear ()
73
+ TokenSet _token_set;
74
+ TupleSet _tuple_set;
75
+ std::vector<int> _matches; // match counts, indexed doc_i * Size () + doc_j
76
+ int _last_group_id; // most recent id returned by GetNewGroupId ()
77
+ };
78
+
79
+ #endif
80
+
@@ -0,0 +1,2 @@
1
+ require 'mkmf' # Ruby's standard library for generating extension Makefiles
2
+ create_makefile('uhferret_lib') # write the Makefile for the extension named uhferret_lib
@@ -0,0 +1,196 @@
1
+ #include "tokenreader.h"
2
+
3
+ /**
4
+ * This file is part of uhferret.
5
+ *
6
+ * Author:: Peter Lane
7
+ * Copyright:: Copyright 2011, Peter Lane.
8
+ * License:: GPLv3
9
+ *
10
+ * uhferret is free software: you can redistribute it and/or modify
11
+ * it under the terms of the GNU General Public License as published by
12
+ * the Free Software Foundation, either version 3 of the License, or
13
+ * (at your option) any later version.
14
+ *
15
+ * uhferret is distributed in the hope that it will be useful,
16
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
17
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18
+ * GNU General Public License for more details.
19
+ *
20
+ * You should have received a copy of the GNU General Public License
21
+ * along with uhferret. If not, see <http://www.gnu.org/licenses/>.
22
+ */
23
+ // Bind the reader to the given stream; this constructor reads nothing from it.
24
+ TokenReader::TokenReader (std::istream & input)
25
+ : _input (input),
26
+ _position (0),
27
+ _done (false)
28
+ { _token_start = 0; } // was left uninitialised, so GetTokenStart/GetTokenEnd were indeterminate before the first token
29
+ // Look up the current token's string in tokenset and return the index it gives back.
30
+ std::size_t TokenReader::GetToken (TokenSet & tokenset)
31
+ {
32
+ return tokenset.GetIndexFor (this->_token.GetString ());
33
+ }
34
+ // True once a ReadToken call has marked reading as done.
35
+ bool TokenReader::IsFinished () const
36
+ {
37
+ return this->_done;
38
+ }
39
+ // Position in the input at which the current token began.
40
+ int TokenReader::GetTokenStart () const
41
+ {
42
+ return this->_token_start;
43
+ }
44
+ // Position just past the current token: its start plus its length.
45
+ int TokenReader::GetTokenEnd () const
46
+ {
47
+ return _token.GetLength () + _token_start;
48
+ }
49
+ // check if given character is alphabetic, as judged by isalpha
50
+ // (passed as unsigned char: isalpha is undefined for negative char values)
51
+ bool WordReader::IsAlphabetChar (char ch)
52
+ {
53
+ return isalpha (static_cast<unsigned char> (ch)) != 0;
54
+ }
55
+ // this function checks if the input character is from a language
56
+ // representing words as single characters. Currently, the only range
57
+ // tested is 0x3400 up to (not including) 0xa000, for Chinese. NOTE(review): an 8-bit
58
+ // char can never hold such a value, so this looks always false for byte input -- confirm intent.
59
+ bool WordReader::IsSingleCharWord (char ch)
60
+ {
61
+ return !(ch < 0x3400 || ch >= 0xa000); // i.e. inside [0x3400, 0xa000)
62
+ }
63
+
64
+ // WordReader identifies words as sequences of alphabetic characters
65
+ // -- using IsSingleCharWord, WordReader also separates out words from
66
+ // languages like Chinese which can represent a complete word as a
67
+ // single character. Returns true when a token was read, false when input is exhausted.
68
+ bool WordReader::ReadToken ()
69
+ {
70
+ if (_done) return false; // reading is done
71
+ // step to first alphabetical character
72
+ do
73
+ {
74
+ _input.get (_look);
75
+ _position++;
76
+ }
77
+ while (!IsAlphabetChar (_look) && _input.good ());
78
+ // the stream failed before any alphabetical character was found
79
+ if (!_input.good ())
80
+ {
81
+ _done = true; // mark reading as done
82
+ return false; // return with no token read
83
+ }
84
+ // read in the alphabetical characters
85
+ _token.Erase (); // start a new token
86
+ _token_start = _position-1; // - 1 because first character is in _look
87
+ if (IsSingleCharWord (_look))
88
+ {
89
+ _token.AddChar (_look); // a single-character word is a token on its own
90
+ }
91
+ else
92
+ {
93
+ do
94
+ {
95
+ _token.AddChar (tolower (static_cast<unsigned char> (_look))); // lower case; tolower is undefined for negative char values
96
+ _input.get (_look);
97
+ _position++;
98
+ }
99
+ while (IsAlphabetChar (_look) && !IsSingleCharWord (_look) && _input.good ());
100
+ _input.unget (); // replace last character, as not part of token
101
+ _position--;
102
+ }
103
+ // the token is still returned when the stream ended while reading it
104
+ if (!_input.good ()) _done = true; // mark reading as done
105
+
106
+ return true;
107
+ }
108
+ // True when c is one of the 26 punctuation characters treated as symbols.
109
+ bool CCodeReader::IsSymbol (char c)
110
+ {
111
+ // every accepted character, listed in the order the comparisons used to appear
112
+ static const std::string symbol_chars (
113
+ "!%/*+" "-=|,?"
114
+ ".&(){" "}<>:;"
115
+ "^[]\"#" "~");
116
+ return symbol_chars.find (c) != std::string::npos;
117
+ }
118
+ // True when token followed by c spells one of the known multi-character symbols.
119
+ bool CCodeReader::IsSymbol (std::string token, char c)
120
+ {
121
+ static const char * const known[] = {
122
+ "!=", "++", "--", "==",
123
+ ">=", "<=", "||", "&&",
124
+ "+=", "-=", "*=", "/=",
125
+ "%=", "&=", "|=", "^=",
126
+ "::", "->", "//", "<<",
127
+ ">>", "##", "/*", "*/",
128
+ "/**" };
129
+ const std::string candidate = token + c;
130
+ for (std::size_t k = 0; k < sizeof known / sizeof known[0]; ++k)
131
+ {
132
+ if (candidate == known[k]) return true;
133
+ }
134
+ return false;
135
+ }
136
+ // Read the next code token: a symbol, a number or a name. Returns false when input is exhausted.
137
+ bool CCodeReader::ReadToken ()
138
+ {
139
+ if (_done) return false;
140
+ // step to first non-whitespace character (cast: the <cctype> tests are undefined for negative char values)
141
+ do
142
+ {
143
+ _input.get (_look);
144
+ _position++;
145
+ }
146
+ while (std::isspace (static_cast<unsigned char> (_look)) && _input.good ());
147
+ // the stream failed before any non-whitespace character was found
148
+ if (!_input.good ())
149
+ {
150
+ _done = true; // mark reading as done
151
+ return false; // return with no token read
152
+ }
153
+ // read in the token
154
+ _token.Erase (); // start a new token
155
+ _token_start = _position-1; // - 1 because first character is in _look
156
+ // check for different cases -- note, precise syntax not important!
157
+ if (IsSymbol (_look))
158
+ {
159
+ // read in a symbol, extending it while the longer text is still a known symbol
160
+ do
161
+ {
162
+ _token.AddChar (_look);
163
+ _input.get (_look);
164
+ _position++;
165
+ }
166
+ while ((IsSymbol (_token.GetString (), _look)) && (_input.good ()));
167
+ }
168
+ else if (std::isdigit (static_cast<unsigned char> (_look)) || _look == '.')
169
+ {
170
+ // read in a number (digits and dots)
171
+ do
172
+ {
173
+ _token.AddChar (_look);
174
+ _input.get (_look);
175
+ _position++;
176
+ }
177
+ while ((std::isdigit (static_cast<unsigned char> (_look)) || _look == '.') && (_input.good ()));
178
+ }
179
+ else
180
+ { // assume we have characters for a variable or other name
181
+ do
182
+ {
183
+ _token.AddChar (_look);
184
+ _input.get (_look);
185
+ _position++;
186
+ }
187
+ while ((std::isalnum (static_cast<unsigned char> (_look)) || _look == '_') && (_input.good ()));
188
+ }
189
+ _input.unget (); // replace last character, as not part of token
190
+ _position--;
191
+ // the token is still returned when the stream ended while reading it
192
+ if (!_input.good ()) _done = true; // mark reading as done
193
+
194
+ return true;
195
+ }
196
+