RubyGems - uhferret - Versions diffs - 1.3.7 - Mend

uhferret 1.3.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

checksums.yaml +7 -0
data/COPYING.txt +674 -0
data/README.rdoc +79 -0
data/bin/uhferret +129 -0
data/bin/uhferret-server +68 -0
data/ext/document.cpp +231 -0
data/ext/document.h +89 -0
data/ext/documentlist.cpp +229 -0
data/ext/documentlist.h +80 -0
data/ext/extconf.rb +2 -0
data/ext/tokenreader.cpp +196 -0
data/ext/tokenreader.h +85 -0
data/ext/tokenset.cpp +111 -0
data/ext/tokenset.h +73 -0
data/ext/tupleset.cpp +150 -0
data/ext/tupleset.h +92 -0
data/ext/uhferret_lib_wrap.cxx +10726 -0
data/lib/uhferret.rb +441 -0
data/lib/utils.rb +93 -0
data/lib/webferret.rb +246 -0
metadata +71 -0

data/ext/document.h ADDED

@@ -0,0 +1,89 @@
+#if !defined document_h
+#define document_h
+/**
+ * This file is part of uhferret.
+ *
+ * Author::    Peter Lane
+ * Copyright:: Copyright 2011, Peter Lane.
+ * License::   GPLv3
+ *
+ * uhferret is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * uhferret is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with uhferret.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#include <cctype>
+#include <fstream>
+#include <istream>
+#include <string>
+#include "tokenset.h"
+#include "tokenreader.h"
+/** Document points to a document on the local filestore.
+  * -- each Document is initialised with a pathname and the type of a document
+  *    or it may take these values from a given Document
+  * -- the group_id is used to place documents into groups: documents with the same id
+  *    will not be compared against each other
+  * -- Document owns a TokenReader, which is created on heap during initialisation
+  * -- the important part of the class is the set of methods for iterating
+  *    across the trigrams, using ReadTrigram, GetTrigramStart/End and GetToken
+  * -- NOTE(review): no destructor or copy operations are declared in this class
+  *    although it holds raw pointers (_fb, _cin, _token_input); confirm in
+  *    document.cpp that CloseInput releases whatever StartInput allocates
+  */
+class Document
+{
+	public:
+		enum DocumentType { TypeText, TypeCode };
+		// type defaults to TypeText and the group id to 0 when not supplied
+		Document (std::string pathname, DocumentType type = TypeText, int id = 0);
+		// construct from the values held by an existing Document
+		Document (Document * document);
+		// basic information about document: its type and names
+		void SetType (DocumentType type);
+		bool IsTextType () const;
+    std::string GetPathname () const;
+		void SetPathname (std::string pathname);
+		// accessor/setter for group_id
+		int GetGroupId () const;
+		void SetGroupId (int id);
+		// information about trigrams in document
+		int GetTrigramCount () const;
+		void SetTrigramCount (int count);
+		void ResetTrigramCount ();
+		void IncrementTrigramCount ();
+		// following methods used to start, read and end processing of trigrams
+		// -- the expected calling sequence is StartInput, then ReadTrigram
+		//    repeatedly until it returns false, then CloseInput
+		bool StartInput (TokenSet & tokenset);
+		bool StartInput (std::istream & input, TokenSet & tokenset);
+		bool ReadTrigram (TokenSet & tokenset);
+		std::size_t GetToken (int i) const;		// access token of current trigram
+		std::size_t GetTrigramStart () const;		// access start position of trigram
+		std::size_t GetTrigramStart (int i) const;	// access start of token i in trigram
+		std::size_t GetTrigramEnd () const;		// access end position of trigram
+		void CloseInput ();
+		// following methods check the type of the document based on its filename
+		bool IsCodeType () const;
+		bool IsTxtType () const;
+		bool IsUnknownType () const;
+	private:
+		bool IsFileType (std::string extension) const;
+    std::string StringToUpper (std::string) const;
+		void InitialiseInput (TokenSet & tokenset);
+		std::string	  _pathname; 		// -- source for this document
+		DocumentType	  _type;
+		int 		  _num_trigrams;
+		// NOTE(review): presumably _fb is the file stream opened for _pathname and
+		// _cin the stream actually read from -- confirm against document.cpp
+    std::ifstream * _fb;
+    std::istream * _cin;
+		TokenReader 	* _token_input; // this is a pointer, because initialised separately
+		// the three tokens of the current trigram, and their start positions
+		std::size_t	  _current_tuple[3];
+		std::size_t	  _current_start[3];
+		int		  _group_id;	// an index number indicating this document's group
+};
+#endif

data/ext/documentlist.cpp ADDED

@@ -0,0 +1,229 @@
+#include "documentlist.h"
+/**
+ * This file is part of uhferret.
+ *
+ * Author::    Peter Lane
+ * Copyright:: Copyright 2011, Peter Lane.
+ * License::   GPLv3
+ *
+ * uhferret is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * uhferret is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with uhferret.  If not, see <http://www.gnu.org/licenses/>.
+ */
+// Construct an empty list.
+//  -- the group id counter starts at 0, so the first id handed out by
+//     GetNewGroupId is 1
+DocumentList::DocumentList ()
+	: _last_group_id (0)
+{
+}
+// Destroying the list deletes every Document still held in it (see Clear)
+DocumentList::~DocumentList ()
+{
+	Clear ();
+}
+// Create a new Document on the heap for the given pathname and type, and
+// append it to the list.
+//  -- the document is placed in a newly generated group of its own
+void DocumentList::AddDocument (std::string pathname, Document::DocumentType type)
+{
+	const int group_id = GetNewGroupId ();
+	Document * document = new Document (pathname, type, group_id);
+	_documents.push_back (document);
+}
+// Create a new Document on the heap for the given pathname and type, and
+// append it to the list.
+//  -- the caller supplies the group id; no new id is generated
+void DocumentList::AddDocument (std::string pathname, Document::DocumentType type, int id)
+{
+	Document * document = new Document (pathname, type, id);
+	_documents.push_back (document);
+}
+// Return the document stored at position i.
+//  -- i must be a valid position; this is checked only by assert
+//  -- i is unsigned, so only the upper bound needs checking
+Document * DocumentList::getDocument (std::size_t i) const
+{
+	assert (i < _documents.size ());
+	Document * found = _documents[i];
+	return found;
+}
+// Remove the first entry in the list which points to the given document.
+//  -- nothing happens if the document is not in the list
+//  -- the Document itself is not deleted here, only its entry in the list
+void DocumentList::RemoveDocument (Document * doc)
+{
+	std::vector<Document *>::iterator position = _documents.begin ();
+	while (position != _documents.end () && *position != doc)
+	{
+		++position;
+	}
+	if (position != _documents.end ())
+	{
+		_documents.erase (position);
+	}
+}
+// Give direct, modifiable access to the token set held by this list
+TokenSet & DocumentList::GetTokenSet ()
+{
+	return _token_set;
+}
+// Give direct, modifiable access to the tuple set held by this list
+TupleSet & DocumentList::GetTupleSet ()
+{
+	return _tuple_set;
+}
+// The list owns its documents, so emptying the list must also delete them.
+//  -- afterwards the token set, tuple set and match counts are reset too
+void DocumentList::Clear ()
+{
+	for (std::vector<Document *>::iterator it = _documents.begin ();
+		it != _documents.end ();
+		++it)
+	{
+		delete *it;
+	}
+	_documents.clear ();
+	ResetReading ();
+}
+// hand out the next unused group id, by advancing the stored counter.
+// note that id = 0 has special meaning (files downloaded from web),
+// and is never returned from here as the counter is advanced before use
+int DocumentList::GetNewGroupId ()
+{
+	return ++_last_group_id;
+}
+// Discard everything computed from reading the documents: the known tokens,
+// the trigram-to-document map, and the table of match counts.
+//  -- the documents themselves stay in the list
+void DocumentList::ResetReading ()
+{
+	_token_set.Clear ();
+	_tuple_set.Clear ();
+	_matches.clear ();
+}
+// Number of documents currently in the list, converted to int
+int DocumentList::Size () const
+{
+	const int document_count = _documents.size ();
+	return document_count;
+}
+// Count the pairs of documents which belong to different groups.
+//  -- each unordered pair is considered once
+//  -- pairs of documents in the same group are not counted
+int DocumentList::NumberOfPairs () const
+{
+	const std::size_t total = _documents.size ();
+	int distinct_pairs = 0;
+	for (std::size_t first = 0; first < total; ++first)
+	{
+		for (std::size_t second = first + 1; second < total; ++second)
+		{
+			const bool same_group =
+				(_documents[first]->GetGroupId () == _documents[second]->GetGroupId ());
+			if (!same_group)
+			{
+				++distinct_pairs;
+			}
+		}
+	}
+	return distinct_pairs;
+}
+// Run the complete comparison process.
+//  -- all previous tokens, trigrams and match counts are discarded first
+//  -- only documents from position first_document onwards are read
+void DocumentList::RunFerret (int first_document)
+{
+	ResetReading ();
+	// phase 1 -- read each file in turn, finding trigrams
+	// (the index is compared as an unsigned value against the list size)
+	int index = first_document;
+	while (static_cast<std::size_t> (index) < _documents.size ())
+	{
+		ReadDocument (index);
+		++index;
+	}
+	// phase 2 -- compute the similarities
+	ComputeSimilarities ();
+}
+// Read all the trigrams from document i, recording each one in the tuple set.
+//  -- the document's trigram count is reset, and then incremented every time
+//     the tuple set's AddDocument returns true for a trigram
+//  -- the value returned from StartInput is not checked
+void DocumentList::ReadDocument (int i)
+{
+	Document * current = _documents[i];
+	current->StartInput (_token_set);
+	current->ResetTrigramCount ();
+	while (current->ReadTrigram (_token_set))
+	{
+		const bool counted = _tuple_set.AddDocument (
+				current->GetToken (0),
+				current->GetToken (1),
+				current->GetToken (2),
+				i);
+		if (counted)
+		{
+			current->IncrementTrigramCount ();
+		}
+	}
+	current->CloseInput ();
+}
+// Set up the table of match counts as size*size zeros, where size is the
+// number of documents in the list.
+//  -- the table is replaced rather than appended to, so that calling this
+//     (or ComputeSimilarities) more than once does not keep stale counts
+//     or let the table grow
+void DocumentList::ClearSimilarities ()
+{
+	const std::size_t num_documents = _documents.size ();
+	_matches.assign (num_documents * num_documents, 0);
+}
+// Fill the table of match counts from the tuple set.
+//  -- for every trigram, each pair of documents listed against it has its
+//     count increased by one
+//  -- counts are stored at [smaller * size + larger] only, which is the
+//     half of the table read by CountMatches
+void DocumentList::ComputeSimilarities ()
+{
+	ClearSimilarities ();
+	const std::size_t num_documents = _documents.size ();
+	for (_tuple_set.Begin (); _tuple_set.HasMore (); _tuple_set.GetNext ())
+	{
+		const std::vector<int> & fvector = _tuple_set.GetDocumentsForCurrentTuple ();
+		// take each pair of documents in the vector, and add one to matches
+		for (std::size_t fi = 0, n = fvector.size (); fi < n; ++fi)
+		{
+			for (std::size_t fj = fi + 1; fj < n; ++fj)
+			{
+				// ensure that first index is smaller than the second
+				//  -- compare the document indices themselves, not their
+				//     positions in the vector, which are always in order
+				const int first = fvector[fi];
+				const int second = fvector[fj];
+				const int doc1 = (first <= second ? first : second);
+				const int doc2 = (first <= second ? second : first);
+				const std::size_t slot = doc1 * num_documents + doc2;
+				assert (slot < _matches.size ());
+				_matches[slot]++;
+			}
+		}
+	}
+}
+// Return the size reported by the tuple set
+int DocumentList::GetTotalTrigramCount ()
+{
+	return _tuple_set.Size ();
+}
+// Return the trigram count stored in document doc_i
+//  -- doc_i is used to index the list directly, without a range check
+int DocumentList::CountTrigrams (int doc_i)
+{
+	return _documents[doc_i]->GetTrigramCount ();
+}
+// Return the number of matches recorded for the pair (doc_i, doc_j).
+//  -- requires doc_j > doc_i, checked by assert
+int DocumentList::CountMatches (int doc_i, int doc_j)
+{
+	// _matches is only completed from one side, with doc_j > doc_i
+	assert (doc_j > doc_i);
+	const std::size_t slot = doc_i * _documents.size () + doc_j;
+	assert (slot < _matches.size ());
+	return _matches[slot];
+}
+// Compute the resemblance of two documents: the number of matches
+// divided by (trigrams in doc_i + trigrams in doc_j - matches).
+//  -- the documents may be given in either order, as for ComputeContainment;
+//     the indices are put in order before CountMatches is called
+//  -- doc_i and doc_j must be different (CountMatches asserts this)
+//  -- returns 0.0 when the divisor would be zero
+float DocumentList::ComputeResemblance (int doc_i, int doc_j)
+{
+	const int shared = (doc_j > doc_i ? CountMatches (doc_i, doc_j) : CountMatches (doc_j, doc_i));
+	float num_matches = (float)shared;
+	float total_trigrams = (float)(CountTrigrams (doc_i) + CountTrigrams (doc_j) - shared);
+	if (total_trigrams == 0.0) return 0.0; // check for divide by zero
+	return num_matches/total_trigrams;
+}
+// Compute the containment for two documents: the number of matches
+// divided by the number of trigrams in doc_j.
+//  -- the documents may be given in either order
+//  -- returns 0.0 when doc_j has no trigrams
+float DocumentList::ComputeContainment (int doc_i, int doc_j)
+{
+	float num_matches;
+	if (doc_j > doc_i)
+	{
+		num_matches = (float)CountMatches (doc_i, doc_j);
+	}
+	else
+	{
+		num_matches = (float)CountMatches (doc_j, doc_i);
+	}
+	const float target_trigrams = (float)(CountTrigrams (doc_j));
+	// check for divide by zero
+	if (target_trigrams == 0.0)
+	{
+		return 0.0;
+	}
+	return num_matches/target_trigrams;
+}
+// Ask the tuple set whether the trigram (t0, t1, t2) matches for doc1 and doc2
+bool DocumentList::IsMatchingTrigram (std::size_t t0, std::size_t t1, std::size_t t2, int doc1, int doc2)
+{
+	return _tuple_set.IsMatchingTuple (t0, t1, t2, doc1, doc2);
+}
+// Build a printable form of the trigram (t0, t1, t2): the strings for the
+// three tokens, in order, separated by single spaces
+std::string DocumentList::MakeTrigramString (std::size_t t0, std::size_t t1, std::size_t t2)
+{
+	std::string result = "";
+	result += _token_set.GetStringFor (t0);
+	result += " ";
+	result += _token_set.GetStringFor (t1);
+	result += " ";
+	result += _token_set.GetStringFor (t2);
+	return result;
+}
+// Return whatever the tuple set collects as matching tuples for doc1 and doc2
+//  -- the token set is passed along to the tuple set for this purpose
+std::vector<std::string> DocumentList::CollectMatchingTrigrams (int doc1, int doc2)
+{
+	return _tuple_set.CollectMatchingTuples (doc1, doc2, _token_set);
+}

data/ext/documentlist.h ADDED

@@ -0,0 +1,80 @@
+#if !defined documentlist_h
+#define documentlist_h
+/**
+ * This file is part of uhferret.
+ *
+ * Author::    Peter Lane
+ * Copyright:: Copyright 2011, Peter Lane.
+ * License::   GPLv3
+ *
+ * uhferret is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * uhferret is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with uhferret.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#include <assert.h>
+#include <vector>
+#include "tokenset.h"
+#include "tupleset.h"
+#include "document.h"
+/** DocumentList maintains a list of documents, a TokenSet of identified Tokens and
+  *    a TupleSet, which maps from sequences of three tokens to lists of documents
+  *    in which the trigrams were found.
+  * -- Methods are provided to calculate information about pairs of documents,
+  *    such as Resemblance and Containment.
+  * -- Note that the Documents are created on the heap by AddDocument and are
+  *    owned by this class: Clear and the destructor delete them.
+  *    RemoveDocument only takes a Document out of the list, without deleting it.
+  * -- Documents are referred to by their position in the list in the methods
+  *    taking an int or std::size_t index.
+  */
+class DocumentList
+{
+	public:
+		DocumentList ();
+		~DocumentList ();
+		// add a document in a new group of its own
+		void AddDocument (std::string pathname, Document::DocumentType type = Document::TypeText);
+		// add a document in the group with the given id
+		void AddDocument (std::string pathname, Document::DocumentType type, int id);
+		// NOTE(review): getDocument starts with a lower-case letter, unlike the other methods
+    Document * getDocument (std::size_t i) const;
+		void RemoveDocument (Document * doc);
+		TokenSet & GetTokenSet ();
+		TupleSet & GetTupleSet ();
+		// delete all documents and reset all computed information
+		void Clear ();
+		int GetNewGroupId ();
+		// reset the computed information, keeping the documents
+		void ResetReading ();
+		int Size () const;
+		// number of pairs of documents which are in different groups
+		int NumberOfPairs () const;
+		// read documents from first_document onwards, then compute similarities
+		void RunFerret (int first_document = 0);
+		void ReadDocument (int i);
+		void ClearSimilarities ();
+		void ComputeSimilarities ();
+		int GetTotalTrigramCount ();
+		int CountTrigrams (int doc_i);
+		// requires doc_j > doc_i
+		int CountMatches (int doc_i, int doc_j);
+		float ComputeResemblance (int doc_i, int doc_j);
+		float ComputeContainment (int doc_i, int doc_j);
+		// check if given trigram is in both the indexed documents
+		bool IsMatchingTrigram (std::size_t t0, std::size_t t1, std::size_t t2, int doc1, int doc2);
+		// convert given trigram into a string
+		std::string MakeTrigramString (std::size_t t0, std::size_t t1, std::size_t t2);
+		// collect all the matching trigrams in the two documents into a vector of strings
+    std::vector<std::string> CollectMatchingTrigrams (int doc1, int doc2);
+	private:
+		std::vector<Document *>	_documents;
+		TokenSet		_token_set;
+		TupleSet		_tuple_set;
+		// flat table of match counts, indexed by doc_i * number of documents + doc_j
+		std::vector<int>	_matches;
+		// the group id most recently handed out by GetNewGroupId
+		int			_last_group_id;
+};
+#endif

data/ext/extconf.rb ADDED

	@@ -0,0 +1,2 @@
1	+ require 'mkmf'
2	+ create_makefile('uhferret_lib')

data/ext/tokenreader.cpp ADDED

@@ -0,0 +1,196 @@
+#include "tokenreader.h"
+/**
+ * This file is part of uhferret.
+ *
+ * Author::    Peter Lane
+ * Copyright:: Copyright 2011, Peter Lane.
+ * License::   GPLv3
+ *
+ * uhferret is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * uhferret is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with uhferret.  If not, see <http://www.gnu.org/licenses/>.
+ */
+// Set up a reader on the given input stream, at position 0 and not yet done.
+//  -- the stream is bound to the _input member, not copied
+//  -- NOTE(review): _token_start is not given a value here; confirm it is
+//     never read before the first successful ReadToken
+TokenReader::TokenReader (std::istream & input) :
+	_input (input), _position (0), _done (false)
+{
+}
+// Look up the string of the current token in the given token set,
+// returning the index which the token set gives for it
+std::size_t TokenReader::GetToken (TokenSet & tokenset)
+{
+	return tokenset.GetIndexFor (_token.GetString ());
+}
+// True once reading has been marked as done
+bool TokenReader::IsFinished () const
+{
+	return _done;
+}
+// Position in the input at which the current token starts
+int TokenReader::GetTokenStart () const
+{
+	return _token_start;
+}
+// Position just after the current token: its start plus its length
+int TokenReader::GetTokenEnd () const
+{
+	return _token_start + _token.GetLength ();
+}
+// check if given character is alphabetic, as decided by isalpha
+//  -- the character is converted to unsigned char before the call, because
+//     passing isalpha a negative value (other than EOF) is undefined, and
+//     plain char may be signed
+bool WordReader::IsAlphabetChar (char ch)
+{
+	return isalpha (static_cast<unsigned char> (ch)) != 0;
+}
+// this function checks if the input character is from a language
+// representing words as single characters.  Currently, this works
+// only for Chinese, by testing for a value in the range [0x3400, 0xa000).
+// NOTE(review): where char is 8 bits wide its value can never reach 0x3400,
+// so this test cannot succeed there -- confirm whether a wider character
+// type was intended
+bool WordReader::IsSingleCharWord (char ch)
+{
+	const int range_begin = 0x3400;
+	const int range_end = 0xa000;
+	const bool at_or_after_begin = (ch >= range_begin);
+	const bool before_end = (ch < range_end);
+	return at_or_after_begin && before_end; // check if Chinese
+}
+// WordReader identifies words as sequences of alphabetic characters
+//  -- using IsSingleCharWord, WordReader also separates out words from
+//     languages like Chinese which can represent a complete word as a
+//     single character
+//  -- returns true when a token has been read into _token, and false when
+//     no token could be read because the input has run out
+//  -- _done is set as soon as the stream is found not to be good, so a call
+//     may return true and also mark the reader as done
+bool WordReader::ReadToken ()
+{
+	if (_done) return false;	// reading is done
+	// step to first alphabetical character
+	// NOTE(review): _look is tested before the state of the stream, so after
+	// a failed get the test is made on whatever _look already held -- confirm
+	// _look always has a value by then
+	do
+	{
+		_input.get (_look);
+		_position++;
+	}
+	while (!IsAlphabetChar (_look) && _input.good ());
+	// check for finished
+	if (!_input.good ())
+	{
+		_done = true;	// mark reading as done
+		return false;	// return with no token read
+	}
+	// read in the alphabetical characters
+	_token.Erase ();		// start a new token
+	_token_start = _position-1;	// - 1 because first character is in _look
+	if (IsSingleCharWord (_look))
+	{
+		// a single-character word is a token by itself, and is not lower-cased
+		_token.AddChar (_look);
+	}
+	else
+	{
+		do
+		{
+			_token.AddChar (tolower (_look)); // put everything into lower case
+			_input.get (_look);
+			_position++;
+		}
+		while (IsAlphabetChar (_look) && !IsSingleCharWord (_look) && _input.good ());
+		// the loop has read one character beyond the token, so step back over it
+		// NOTE(review): unget is also called when the loop ended because the
+		// stream failed -- confirm this is harmless
+		_input.unget ();	// replace last character, as not part of token
+		_position--;
+	}
+	// check for finished
+	if (!_input.good ()) _done = true;	// mark reading as done
+	return true;
+}
+// check if given character is one of the single characters treated as
+// (the start of) a symbol when reading code
+bool CCodeReader::IsSymbol (char c)
+{
+	switch (c)
+	{
+		case '!': case '%': case '/': case '*': case '+':
+		case '-': case '=': case '|': case ',': case '?':
+		case '.': case '&': case '(': case ')': case '{':
+		case '}': case '<': case '>': case ':': case ';':
+		case '^': case '[': case ']': case '"': case '#':
+		case '~':
+			return true;
+		default:
+			return false;
+	}
+}
+// check if the token read so far, extended with the character c, is one of
+// the known multi-character symbols
+//  -- used to decide whether c should be added to the symbol being read
+bool CCodeReader::IsSymbol (std::string token, char c)
+{
+	static const char * const compound_symbols[] =
+	{
+		"!=", "++", "--", "==", ">=", "<=", "||", "&&",
+		"+=", "-=", "*=", "/=", "%=", "&=", "|=", "^=",
+		"::", "->", "//", "<<", ">>", "##", "/*", "*/",
+		"/**"
+	};
+	const std::size_t symbol_count = sizeof (compound_symbols) / sizeof (compound_symbols[0]);
+	// token is a copy, so it can be extended in place to form the candidate
+	token += c;
+	for (std::size_t k = 0; k < symbol_count; ++k)
+	{
+		if (token == compound_symbols[k])
+		{
+			return true;
+		}
+	}
+	return false;
+}
+// CCodeReader splits its input into symbols, numbers and names
+//  -- whitespace is skipped before each token
+//  -- returns true when a token has been read into _token, and false when
+//     no token could be read because the input has run out
+//  -- _done is set as soon as the stream is found not to be good, so a call
+//     may return true and also mark the reader as done
+bool CCodeReader::ReadToken ()
+{
+	if (_done) return false;
+	// step to first non-whitespace character
+	// NOTE(review): _look is tested before the state of the stream, so after
+	// a failed get the test is made on whatever _look already held -- confirm
+	// _look always has a value by then
+	do
+	{
+		_input.get (_look);
+		_position++;
+	}
+	while (std::isspace (_look) && _input.good ());
+	// check for finished
+	if (!_input.good ())
+	{
+		_done = true;	// mark reading as done
+		return false;	// return with no token read
+	}
+	// read in the token
+	_token.Erase ();		// start a new token
+	_token_start = _position-1;	// - 1 because first character is in _look
+	// check for different cases -- note, precise syntax not important!
+	if (IsSymbol (_look))
+	{
+		// read in a symbol
+		// -- further characters are added only while the token so far plus
+		//    the next character is a known multi-character symbol
+		do
+		{
+			_token.AddChar (_look);
+			_input.get (_look);
+			_position++;
+		}
+		while ((IsSymbol (_token.GetString (), _look)) && (_input.good ()));
+	}
+	// '.' counts as a symbol, so a token starting with '.' is handled by the
+	// branch above; a number therefore always starts with a digit here
+	else if (std::isdigit(_look) || _look == '.')
+	{
+		// read in a number
+		do
+		{
+			_token.AddChar (_look);
+			_input.get (_look);
+			_position++;
+		}
+		while ((std::isdigit (_look) || _look == '.') && (_input.good ()));
+	}
+	else
+	{ // assume we have characters for a variable or other name
+		// -- the first character is added whatever it is; after that only
+		//    letters, digits and '_' continue the token
+		do
+		{
+			_token.AddChar (_look);
+			_input.get (_look);
+			_position++;
+		}
+		while ((std::isalnum (_look) || _look == '_') && (_input.good ()));
+	}
+	// every branch has read one character beyond the token, so step back over it
+	// NOTE(review): unget is also called when the loop ended because the
+	// stream failed -- confirm this is harmless
+	_input.unget (); // replace last character, as not part of token
+	_position--;
+	// check for finished
+	if (!_input.good ()) _done = true;	// mark reading as done
+	return true;
+}