RubyGems - uhferret - Versions diffs - 1.3.7 - Mend

uhferret 1.3.7

Files changed (21) hide show

checksums.yaml +7 -0
data/COPYING.txt +674 -0
data/README.rdoc +79 -0
data/bin/uhferret +129 -0
data/bin/uhferret-server +68 -0
data/ext/document.cpp +231 -0
data/ext/document.h +89 -0
data/ext/documentlist.cpp +229 -0
data/ext/documentlist.h +80 -0
data/ext/extconf.rb +2 -0
data/ext/tokenreader.cpp +196 -0
data/ext/tokenreader.h +85 -0
data/ext/tokenset.cpp +111 -0
data/ext/tokenset.h +73 -0
data/ext/tupleset.cpp +150 -0
data/ext/tupleset.h +92 -0
data/ext/uhferret_lib_wrap.cxx +10726 -0
data/lib/uhferret.rb +441 -0
data/lib/utils.rb +93 -0
data/lib/webferret.rb +246 -0
metadata +71 -0

@@ -0,0 +1,85 @@
+#if !defined tokenreader_h
+#define tokenreader_h
+/**
+ * This file is part of uhferret.
+ *
+ * Author::    Peter Lane
+ * Copyright:: Copyright 2011, Peter Lane.
+ * License::   GPLv3
+ *
+ * uhferret is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * uhferret is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with uhferret.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#include <ctype.h> // gives tests for if characters are numbers, alphanumerics, etc
+#include <istream>
+#include "tokenset.h"
+/** The TokenReader is the parent class of the different 'token-isers'
+  * -- WordReader tokenises a document into strings of alphanumeric characters
+  * -- CCodeReader tokenises a document into symbols matching a C-style language
+  * The token reader is initialised with an input stream
+  * -- GetToken is used to 'walk through' the document, one token at a time
+  *    until IsFinished returns true.
+  * -- the start and end points of the token can be retrieved using the given methods,
+  *    and the string making up the token can be obtained by caller from the TokenSet
+  */
+class TokenReader
+{
+	public:
+		// construct a reader over the given input stream
+		TokenReader (std::istream & input);
+		// Virtual destructor: this class declares a pure virtual method and is
+		// used as a base class, so destroying a subclass object through a
+		// TokenReader pointer or reference must be well defined.
+		virtual ~TokenReader () {}
+		// return index of last read token
+		std::size_t GetToken (TokenSet & tokenset); // retrieve current token identifier
+		bool IsFinished () const;	// return true if end-of-file reached
+		int GetTokenStart () const;	// return the start position of current token
+		int GetTokenEnd () const;	// return the end position of current token
+		// read token, return true if successful
+		// -- every concrete subclass must provide this method
+		virtual bool ReadToken () = 0;
+	protected: // allow subclasses to access parameters
+		std::istream & _input;   // the stream from which to read
+		int 		_position; // current position in stream
+		Token		_token;    // last token read
+		int		_token_start;	// start position of last token read
+		char		_look;	   // lookahead character
+		bool		_done;	   // becomes true when stream is completed
+};
+// The WordReader separates its input stream into tokens, consisting of
+//     consecutive alphabetic characters
+//     -- every character is converted to lower case
+class WordReader: public TokenReader
+{
+	public:
+		// hand the input stream straight to the TokenReader base
+		WordReader (std::istream & input)
+			: TokenReader (input)
+		{}
+		// character tests used by this reader (defined outside this header)
+		bool IsAlphabetChar (char ch);
+		bool IsSingleCharWord (char ch);
+		// supplies the ReadToken method that TokenReader leaves pure virtual
+		virtual bool ReadToken ();
+};
+// The CCodeReader separates its input stream into tokens, looking for
+//     C-style tokens, numbers and symbols
+class CCodeReader: public TokenReader
+{
+	public:
+		// hand the input stream straight to the TokenReader base
+		CCodeReader (std::istream & input)
+			: TokenReader (input)
+		{}
+		// supplies the ReadToken method that TokenReader leaves pure virtual
+		virtual bool ReadToken ();
+	private:
+		// symbol tests used internally (defined outside this header)
+		bool IsSymbol (char c);
+		bool IsSymbol (std::string token, char c);
+};
+#endif

data/ext/tokenset.cpp ADDED

@@ -0,0 +1,111 @@
+#include "tokenset.h"
+/**
+ * This file is part of uhferret.
+ *
+ * Author::    Peter Lane
+ * Copyright:: Copyright 2011, Peter Lane.
+ * License::   GPLv3
+ *
+ * uhferret is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * uhferret is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with uhferret.  If not, see <http://www.gnu.org/licenses/>.
+ */
+// Create an empty token: no characters stored yet, backed by a
+// freshly allocated buffer with room for two characters.
+Token::Token ()
+	: _token (new char [2]), _capacity (2), _top (0)
+{}
+// Release the character buffer held by this token.
+Token::~Token ()
+{
+	delete [] this->_token;
+}
+// Forget the stored characters by resetting the end marker;
+// the buffer and its capacity are left as they are.
+void Token::Erase ()
+{
+	this->_top = 0;
+}
+// Append one character to the token, enlarging the buffer first
+// when every slot is already in use.
+void Token::AddChar (char c)
+{
+	if (_capacity == _top)
+	{
+		Grow ();
+	}
+	_token [_top++] = c;
+}
+// Build a std::string from the characters stored so far
+// (the first _top characters of the buffer).
+std::string Token::GetString () const
+{
+	std::string text (_token, _top);
+	return text;
+}
+// Number of characters currently stored in the token.
+int Token::GetLength () const
+{
+	const int length = _top;
+	return length;
+}
+// Double the capacity of the character buffer, keeping the characters
+// stored so far.
+void Token::Grow ()
+{
+	const int grown_capacity = 2 * _capacity;
+	char * newtoken = new char [grown_capacity];
+	// only the first _top slots have been written, so only those are copied
+	for (int i = 0; i < _top; ++i)
+		newtoken[i] = _token[i];
+	// the old buffer came from new[], so it must be released with delete[]
+	delete[] _token;
+	_token = newtoken;
+	_capacity = grown_capacity;
+}
+// *** TokenSet
+// An empty token set hands out indices starting from zero.
+TokenSet::TokenSet ()
+	: _nextindex (0)
+{
+}
+// Return the index registered for the given string.  A string not seen
+// before is given the next free index, recorded in both maps.
+std::size_t TokenSet::GetIndexFor (std::string token)
+{
+	_tokens_it = _tokens.find (token);
+	if (_tokens_it == _tokens.end ())
+	{
+		// unknown string: claim the next free index for it
+		const std::size_t fresh_index = _nextindex;
+		_tokens[token] = fresh_index;
+		_strings[fresh_index] = token;
+		++_nextindex;
+		return fresh_index;
+	}
+	return _tokens_it->second; // string already known
+}
+// Return the string registered under the given index.
+// Asking for an index that is not in the set is a programming error.
+std::string TokenSet::GetStringFor (std::size_t token)
+{
+	_strings_it = _strings.find (token);
+	assert (_strings_it != _strings.end ());
+	const std::string & stored = _strings_it->second;
+	return std::string (stored.c_str ());
+}
+// Drop every string/index pair and restart index numbering at zero.
+void TokenSet::Clear ()
+{
+	_nextindex = 0;
+	_strings.clear ();
+	_tokens.clear ();
+}
+// Set the index that will be given to the next new string.
+void TokenSet::SetNextIndex (int index)
+{
+	this->_nextindex = index;
+}
+// Record the given string under the given index, in both directions
+// (string -> index and index -> string).
+void TokenSet::SetIndexString (std::string token, int index)
+{
+	_tokens[token] = index;
+	_strings[index] = token;
+}

data/ext/tokenset.h ADDED

@@ -0,0 +1,73 @@
+#if !defined tokenset_h
+#define tokenset_h
+/**
+ * This file is part of uhferret.
+ *
+ * Author::    Peter Lane
+ * Copyright:: Copyright 2011, Peter Lane.
+ * License::   GPLv3
+ *
+ * uhferret is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * uhferret is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with uhferret.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#include <assert.h>
+#include <map>
+#include <string>
+#include <vector>
+/** A Token is a sequence of characters read in by a TokenReader
+  * -- this class provides a dynamic storage for the token supporting
+  *    addition of characters
+  * -- when finished, the token can be queried for its length and made into a string
+*/
+class Token
+{
+	public:
+		Token ();
+		// Copying makes a separate buffer.  Without these two members a copy
+		// would share _token with the original and both destructors would
+		// release the same buffer.
+		Token (const Token & other)
+			: _token (new char [other._capacity]),
+			  _capacity (other._capacity),
+			  _top (other._top)
+		{
+			for (int i = 0; i < _top; ++i)
+				_token[i] = other._token[i];
+		}
+		Token & operator= (const Token & other)
+		{
+			if (this != &other)
+			{
+				// allocate and fill the new buffer before releasing the old one
+				char * copy = new char [other._capacity];
+				for (int i = 0; i < other._top; ++i)
+					copy[i] = other._token[i];
+				delete[] _token;
+				_token = copy;
+				_capacity = other._capacity;
+				_top = other._top;
+			}
+			return *this;
+		}
+		~Token ();
+		void Erase ();			// discard the stored characters
+		void AddChar (char c);		// append one character
+		std::string GetString () const;	// stored characters as a string
+		int GetLength () const;		// number of stored characters
+	private:
+		void Grow ();	// enlarge the storage
+		char * _token; // storage for the token
+		int	_capacity; // size of the storage
+		int 	_top;	// pointer to end of token
+};
+/** A TokenSet maps strings to token indices
+  * -- this is for memory efficiency, ensuring every token's string is
+  *    stored once within the application
+  */
+class TokenSet
+{
+	// shorthand for the two lookup directions
+	typedef std::map<std::string, std::size_t> IndexByString;
+	typedef std::map<std::size_t, std::string> StringByIndex;
+	public:
+		TokenSet ();
+		// index for a string; an unseen string is given the next free index
+		std::size_t GetIndexFor (std::string token);
+		// string for an index; the index must already be in the set
+		std::string GetStringFor (std::size_t token);
+		// remove all entries and restart numbering
+		void Clear ();
+		// direct control over the stored mapping
+		void SetNextIndex (int index);
+		void SetIndexString (std::string token, int index);
+	private:
+		IndexByString _tokens;				// string -> index
+		IndexByString::const_iterator _tokens_it;	// result of last lookup in _tokens
+		std::size_t _nextindex; // next free index for new string
+		StringByIndex _strings;				// index -> string
+		StringByIndex::const_iterator _strings_it;	// result of last lookup in _strings
+};
+#endif

data/ext/tupleset.cpp ADDED

@@ -0,0 +1,150 @@
+#include "tupleset.h"
+/**
+ * This file is part of uhferret.
+ *
+ * Author::    Peter Lane
+ * Copyright:: Copyright 2011, Peter Lane.
+ * License::   GPLv3
+ *
+ * uhferret is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * uhferret is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with uhferret.  If not, see <http://www.gnu.org/licenses/>.
+ */
+// Nothing to set up: the tuple map starts out empty.
+TupleSet::TupleSet ()
+{
+}
+// Remove every tuple, and with it every document list.
+void TupleSet::Clear ()
+{
+	this->_tuple_map.clear ();
+}
+// Count the tuples by walking the built-in iterator from start to end.
+// Note that this repositions the iterator.
+int TupleSet::Size ()
+{
+	int total = 0;
+	Begin ();
+	while (HasMore ())
+	{
+		++total;
+		GetNext ();
+	}
+	return total;
+}
+// Return the document list stored for the given tuple.
+// The lookup uses operator[] at each level, so an unknown tuple is
+// created with an empty list.
+std::vector<int> & TupleSet::GetDocumentsForTuple (std::size_t token_0, std::size_t token_1, std::size_t token_2)
+{
+	std::vector<int> & documents = _tuple_map[token_0][token_1][token_2];
+	return documents;
+}
+// Make sure the document appears in the list for the given tuple.
+// Returns true when the document had to be added, false when it was
+// already listed.
+bool TupleSet::AddDocument (std::size_t token_0, std::size_t token_1, std::size_t token_2, int document)
+{
+	std::vector<int> & documents = _tuple_map[token_0][token_1][token_2];
+	const int count = documents.size();
+	for (int pos = 0; pos < count; ++pos)
+	{
+		if (documents[pos] == document)
+			return false; // already listed, nothing to add
+	}
+	documents.push_back (document);
+	return true;
+}
+// Check whether both documents appear in the list for the given tuple.
+// The list is obtained through GetDocumentsForTuple, so (as before) an
+// unknown tuple is created with an empty list.
+bool TupleSet::IsMatchingTuple (std::size_t t0, std::size_t t1, std::size_t t2, int doc1, int doc2)
+{
+	// bind to the stored list instead of copying it on every call
+	const std::vector<int> & documents = GetDocumentsForTuple (t0, t1, t2);
+	bool has_doc1 = false;
+	bool has_doc2 = false;
+	for (std::size_t i = 0, n = documents.size (); i < n; ++i)
+	{
+		if (documents[i] == doc1) has_doc1 = true;
+		if (documents[i] == doc2) has_doc2 = true;
+		if (has_doc1 && has_doc2)
+			return true; // both found, no need to scan the rest
+	}
+	return false;
+}
+// Walk every tuple and collect, as strings, those whose document list
+// contains both of the given documents.
+std::vector<std::string> TupleSet::CollectMatchingTuples (int doc1, int doc2, TokenSet & tokenset)
+{
+	std::vector<std::string> matches;
+	Begin ();
+	while (HasMore ())
+	{
+		const bool shared = IsMatchingTuple (GetToken (0), GetToken (1), GetToken (2),
+				doc1, doc2);
+		if (shared)
+			matches.push_back (GetStringForCurrentTuple (tokenset));
+		GetNext ();
+	}
+	return matches;
+}
+// Position the iterator on the first tuple.
+// With an empty tuple map only _ti is set (to the end position), so that
+// HasMore reports false straight away; the inner iterators are left alone
+// because there is no first element to take them from.
+void TupleSet::Begin ()
+{
+	_ti = _tuple_map.begin ();
+	if (_ti == _tuple_map.end ())
+		return; // nothing stored: dereferencing _ti would be invalid
+	_pi = (_ti->second).begin ();
+	_wi = (_pi->second).begin ();
+}
+// Advance the iterator to the next tuple, stepping the innermost level
+// first and carrying over to the outer levels as each one runs out.
+void TupleSet::GetNext ()
+{
+	++_wi;
+	if (_wi != (_pi->second).end ())
+		return; // still inside the current word map
+	// words exhausted: go to the next pair
+	++_pi;
+	if (_pi == (_ti->second).end ())
+	{
+		// pairs exhausted too: go to the next first token
+		++_ti;
+		if (_ti == _tuple_map.end ())
+			return; // finished
+		_pi = (_ti->second).begin ();
+	}
+	_wi = (_pi->second).begin ();
+}
+// True while the outermost iterator has not reached the end of the map.
+bool TupleSet::HasMore () const
+{
+	const bool at_end = (_ti == _tuple_map.end ());
+	return !at_end;
+}
+// Return the document list of the tuple the iterator currently points at.
+std::vector<int> & TupleSet::GetDocumentsForCurrentTuple ()
+{
+	const std::size_t first = _ti->first;
+	const std::size_t second = _pi->first;
+	const std::size_t third = _wi->first;
+	return GetDocumentsForTuple (first, second, third);
+}
+// Spell out the current tuple as its three token strings, separated by
+// single spaces, looking each token up in the given token set.
+std::string TupleSet::GetStringForCurrentTuple (TokenSet & tokenset) const
+{
+	std::string text = tokenset.GetStringFor (_ti->first);
+	text += " ";
+	text += tokenset.GetStringFor (_pi->first);
+	text += " ";
+	text += tokenset.GetStringFor (_wi->first);
+	return text;
+}
+// Return one token identifier of the current tuple:
+// i = 0, 1 or 2 selects the first, second or third token.
+std::size_t TupleSet::GetToken (int i) const
+{
+	assert (i>=0 && i<=2);
+	switch (i)
+	{
+		case 0:
+			return _ti->first;
+		case 1:
+			return _pi->first;
+		default: // i == 2
+			return _wi->first;
+	}
+}

data/ext/tupleset.h ADDED

@@ -0,0 +1,92 @@
+#if !defined tupleset_h
+#define tupleset_h
+/**
+ * This file is part of uhferret.
+ * Initial triple map idea by Bob Dickerson.
+ *
+ * Author::    Peter Lane
+ * Copyright:: Copyright 2011, Peter Lane.
+ * License::   GPLv3
+ *
+ * uhferret is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * uhferret is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with uhferret.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#include <assert.h>
+#include <map>
+#include <string>
+#include <vector>
+#include "tokenset.h"
+/** TupleSet maintains the database mapping trigrams to identifier of documents which contain them.
+  * The mapping is held as a sequence of std::maps, each map taking a std::size_t reference to
+  * a token as a key.  The end result of the three maps is a vector of document identifiers.
+  *
+  * The most important feature of the TupleSet is the collection of methods for iterating over
+  * all tuples in the TupleSet.
+  * e.g. with the definition:  TupleSet tuple_set;
+  * use:                       for (tuple_set.Begin (); tuple_set.HasMore (); tuple_set.GetNext ())
+  *                            {}
+  * to iterate over all the tuples.  The methods GetDocumentsForCurrentTuple, GetStringForCurrentTuple,
+  * and GetToken (called with index 0, 1 or 2) return information on the current tuple.
+  */
+class TupleSet
+{
+	// nested map types, innermost first:
+	//   WordMap: third token  -> list of documents
+	//   PairMap: second token -> WordMap
+	//   TripMap: first token  -> PairMap
+	typedef std::map<std::size_t, std::vector<int> > WordMap;
+	typedef std::map<std::size_t, WordMap> PairMap;
+	typedef std::map<std::size_t, PairMap> TripMap;
+	typedef WordMap::const_iterator WordMapIter;
+	typedef PairMap::const_iterator PairMapIter;
+	typedef TripMap::const_iterator TripMapIter;
+	public:
+		TupleSet ();
+		void Clear ();	// remove all tuples
+		int Size ();	// number of tuples; counted by running the iterator
+		// list of documents which contain the given tuple
+		std::vector<int> & GetDocumentsForTuple (std::size_t token_0,
+				std::size_t token_1, std::size_t token_2);
+		// make sure the document is in the list for the given tuple
+		// - returns true if the document was not already in the list
+		bool AddDocument (std::size_t token_0, std::size_t token_1, std::size_t token_2,
+				int document);
+		// do both documents contain the given tuple?
+		bool IsMatchingTuple (std::size_t t0, std::size_t t1, std::size_t t2, int doc1, int doc2);
+		// strings of all tuples contained in both given documents
+		std::vector<std::string> CollectMatchingTuples (int doc1, int doc2, TokenSet & tokenset);
+		// -- iterator over the tuples in the set --
+		void Begin ();			// start the iterator
+		void GetNext ();		// advance the iterator
+		bool HasMore () const;		// check for end
+		// documents of the current tuple
+		std::vector<int> & GetDocumentsForCurrentTuple ();
+		// string form of the current tuple
+		std::string GetStringForCurrentTuple (TokenSet & tokenset) const;
+		// identifier of one token (i = 0, 1 or 2) of the current tuple
+		std::size_t GetToken (int i) const;
+	private:
+		TripMap	_tuple_map;	// the tuple database itself
+		// iterator state, one iterator per level of the nested maps
+		TripMapIter	_ti;	// iterator from first token to pairs
+		PairMapIter	_pi;	// iterator from second token to words
+		WordMapIter	_wi;	// iterator from third token to document list
+};
+#endif