RubyGems - uhferret - Versions diffs - 1.3.7 - Mend

uhferret 1.3.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

checksums.yaml +7 -0
data/COPYING.txt +674 -0
data/README.rdoc +79 -0
data/bin/uhferret +129 -0
data/bin/uhferret-server +68 -0
data/ext/document.cpp +231 -0
data/ext/document.h +89 -0
data/ext/documentlist.cpp +229 -0
data/ext/documentlist.h +80 -0
data/ext/extconf.rb +2 -0
data/ext/tokenreader.cpp +196 -0
data/ext/tokenreader.h +85 -0
data/ext/tokenset.cpp +111 -0
data/ext/tokenset.h +73 -0
data/ext/tupleset.cpp +150 -0
data/ext/tupleset.h +92 -0
data/ext/uhferret_lib_wrap.cxx +10726 -0
data/lib/uhferret.rb +441 -0
data/lib/utils.rb +93 -0
data/lib/webferret.rb +246 -0
metadata +71 -0

data/ext/tokenreader.h ADDED

@@ -0,0 +1,85 @@
+#if !defined tokenreader_h
+#define tokenreader_h
+/**
+ * This file is part of uhferret.
+ *
+ * Author::    Peter Lane
+ * Copyright:: Copyright 2011, Peter Lane.
+ * License::   GPLv3
+ *
+ * uhferret is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * uhferret is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with uhferret.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#include <ctype.h> // gives tests for if characters are numbers, alphanumerics, etc
+#include <istream>
+#include "tokenset.h"
+/** The TokenReader is the parent class of the different 'token-isers'
+  * -- WordReader tokenises a document into strings of alphanumeric characters
+  * -- CCodeReader tokenises a document into symbols matching a C-style language
+  * The token reader is initialised with an input stream
+  * -- GetToken is used to 'walk through' the document, one token at a time
+  *    until IsFinished returns true.
+  * -- the start and end points of the token can be retrieved using the given methods,
+  *    and the string making up the token can be obtained by caller from the TokenSet
+  */
+class TokenReader
+{
+	public:
+		// Bind the reader to the given stream; only a reference to the stream
+		// is stored, so the stream must outlive the reader.
+		TokenReader (std::istream & input);
+		// Virtual destructor: the class is a polymorphic base (ReadToken is pure
+		// virtual), so deleting a subclass through a TokenReader pointer must be
+		// well defined.
+		virtual ~TokenReader () {}
+		// return index of last read token
+		std::size_t GetToken (TokenSet & tokenset); // retrieve current token identifier
+		bool IsFinished () const;	// return true if end-of-file reached
+		int GetTokenStart () const;	// return the start position of current token
+		int GetTokenEnd () const;	// return the end position of current token
+		// read token, return true if successful
+		// -- user of class must provide this method
+		virtual bool ReadToken () = 0;
+	protected: // allow subclasses to access parameters
+		std::istream & _input;   // the stream from which to read
+		int 		_position; // current position in stream
+		Token		_token;    // last token read
+		int		_token_start;	// start position of last token read
+		char		_look;	   // lookahead character
+		bool		_done;	   // becomes true when stream is completed
+};
+// The WordReader separates its input stream into tokens, consisting of
+//     consecutive alphabetic characters
+//     -- every character is converted to lower case
+class WordReader: public TokenReader
+{
+	public:
+		// Hand the stream straight to the TokenReader base; this class
+		// declares no data members of its own.
+		WordReader (std::istream & input)
+			: TokenReader (input)
+		{}
+		// Character classification helpers (defined outside this header).
+		bool IsAlphabetChar (char ch);
+		bool IsSingleCharWord (char ch);
+		// Implementation of the pure virtual TokenReader::ReadToken.
+		virtual bool ReadToken ();
+};
+// The CCodeReader separates its input stream into tokens, looking for
+//     C-style tokens, numbers and symbols
+class CCodeReader: public TokenReader
+{
+	public:
+		// Hand the stream straight to the TokenReader base; this class
+		// declares no data members of its own.
+		CCodeReader (std::istream & input)
+			: TokenReader (input)
+		{}
+		// Implementation of the pure virtual TokenReader::ReadToken.
+		virtual bool ReadToken ();
+	private:
+		// Symbol tests used by the reader (defined outside this header):
+		// one on a single character, one on a token plus a further character.
+		bool IsSymbol (char c);
+		bool IsSymbol (std::string token, char c);
+};
+#endif

data/ext/tokenset.cpp ADDED

@@ -0,0 +1,111 @@
+#include "tokenset.h"
+/**
+ * This file is part of uhferret.
+ *
+ * Author::    Peter Lane
+ * Copyright:: Copyright 2011, Peter Lane.
+ * License::   GPLv3
+ *
+ * uhferret is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * uhferret is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with uhferret.  If not, see <http://www.gnu.org/licenses/>.
+ */
+// Create an empty token backed by a two-character buffer.
+// The literal 2 is repeated (rather than reading _capacity) because _token
+// is declared, and therefore initialised, before _capacity.
+Token::Token ()
+	: _token (new char [2]), _capacity (2), _top (0)
+{}
+// Release the character buffer owned by this token.
+Token::~Token ()
+{
+	delete[] _token; // array form, since the buffer is allocated with new char[]
+}
+// Make the token empty again. Only the end marker is reset; the buffer
+// and its capacity are left untouched so they can be reused.
+void Token::Erase ()
+{
+	_top = 0;
+}
+// Append one character to the token, enlarging the buffer when it is full.
+void Token::AddChar (char c)
+{
+	// every slot is in use: make room before writing
+	if (_capacity == _top)
+	{
+		Grow ();
+	}
+	_token [_top++] = c;
+}
+// Return a copy of the characters collected so far as a std::string.
+std::string Token::GetString () const
+{
+	std::string text;
+	text.assign (_token, _top); // exactly _top characters; the buffer is not NUL-terminated
+	return text;
+}
+// Return the number of characters currently held in the token.
+int Token::GetLength () const
+{
+	return _top;
+}
+// Double the capacity of the character buffer, keeping the characters
+// stored so far.
+void Token::Grow ()
+{
+	const int newcapacity = 2 * _capacity;
+	char * newtoken = new char [newcapacity];
+	// only the first _top characters hold token data
+	for (int i = 0; i < _top; ++i)
+		newtoken[i] = _token[i];
+	// the old buffer was allocated with new[], so it must be released with
+	// delete[]; plain delete on an array is undefined behaviour
+	delete[] _token;
+	_token = newtoken;
+	_capacity = newcapacity;
+}
+// *** TokenSet
+// Create an empty token set; the first string registered receives index 0.
+TokenSet::TokenSet ()
+	: _nextindex (0)
+{}
+// Return the index associated with the given string, registering the
+// string under the next free index if it has not been seen before.
+std::size_t TokenSet::GetIndexFor (std::string token)
+{
+	_tokens_it = _tokens.find (token);
+	if (_tokens_it != _tokens.end())
+	{
+		return _tokens_it->second; // already known
+	}
+	// unknown string: record it in both directions under the next free index
+	const std::size_t assigned = _nextindex;
+	_tokens[token] = assigned;
+	_strings[assigned] = token;
+	++_nextindex;
+	return assigned;
+}
+// Return the string registered under the given index.
+// The index must be present in the set; this is checked with assert only.
+std::string TokenSet::GetStringFor (std::size_t token)
+{
+	_strings_it = _strings.find (token);
+	// it's an error if token not in token set
+	assert (_strings_it != _strings.end ());
+	const std::string & stored = _strings_it->second;
+	// rebuilt from the C string, so the result stops at the first NUL character
+	return std::string (stored.c_str ());
+}
+// Forget every registered string and restart index allocation from 0.
+void TokenSet::Clear ()
+{
+	_nextindex = 0;
+	_strings.clear ();
+	_tokens.clear ();
+}
+// Set the index that will be handed to the next newly registered string.
+void TokenSet::SetNextIndex (int index)
+{
+	// the counter is a std::size_t; make the conversion from int explicit
+	_nextindex = static_cast<std::size_t> (index);
+}
+// Register the given string under an explicitly chosen index, recording
+// the association in both lookup directions. The next-free-index counter
+// is not touched.
+void TokenSet::SetIndexString (std::string token, int index)
+{
+	// both maps use std::size_t for the index; convert once
+	const std::size_t slot = static_cast<std::size_t> (index);
+	_strings[slot] = token;
+	_tokens[token] = slot;
+}

data/ext/tokenset.h ADDED

@@ -0,0 +1,73 @@
+#if !defined tokenset_h
+#define tokenset_h
+/**
+ * This file is part of uhferret.
+ *
+ * Author::    Peter Lane
+ * Copyright:: Copyright 2011, Peter Lane.
+ * License::   GPLv3
+ *
+ * uhferret is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * uhferret is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with uhferret.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#include <assert.h>
+#include <map>
+#include <string>
+#include <vector>
+/** A Token is a sequence of characters read in by a TokenReader
+  * -- this class provides a dynamic storage for the token supporting
+  *    addition of characters
+  * -- when finished, the token can be queried for its length and made into a string
+*/
+class Token
+{
+	public:
+		Token ();
+		// Deep copy: the class owns a raw buffer, so the compiler-generated
+		// copy would leave two tokens deleting the same memory.
+		// Initialisers follow member declaration order (_token, _capacity, _top).
+		Token (const Token & other)
+			: _token (new char [other._capacity]),
+			  _capacity (other._capacity),
+			  _top (other._top)
+		{
+			for (int i = 0; i < _top; ++i)
+				_token[i] = other._token[i];
+		}
+		// Deep-copy assignment; the new buffer is filled before the old one
+		// is released, so a failed allocation leaves this token unchanged.
+		Token & operator= (const Token & other)
+		{
+			if (this != &other)
+			{
+				char * copy = new char [other._capacity];
+				for (int i = 0; i < other._top; ++i)
+					copy[i] = other._token[i];
+				delete[] _token;
+				_token = copy;
+				_capacity = other._capacity;
+				_top = other._top;
+			}
+			return *this;
+		}
+		~Token ();
+		void Erase ();              // make the token empty
+		void AddChar (char c);      // append one character
+		std::string GetString () const; // characters collected so far, as a string
+		int GetLength () const;     // number of characters collected so far
+	private:
+		void Grow ();   // enlarge the storage
+		char * _token; // storage for the token
+		int	_capacity; // number of characters the storage can hold
+		int 	_top;	// pointer to end of token
+};
+/** A TokenSet maps strings to token indices
+  * -- this is for memory efficiency, ensuring every token's string is
+  *    stored once within the application
+  */
+class TokenSet
+{
+	// typedef's to simplify declarations
+	typedef std::map<std::string, std::size_t> IndexMap;  // string -> index
+	typedef std::map<std::size_t, std::string> StringMap; // index -> string
+	public:
+		TokenSet ();
+		// index for the given string
+		std::size_t GetIndexFor (std::string token);
+		// string for the given index
+		std::string GetStringFor (std::size_t token);
+		// remove all strings and indices
+		void Clear ();
+		// explicit control over the stored indices
+		void SetNextIndex (int index);
+		void SetIndexString (std::string token, int index);
+	private:
+		IndexMap _tokens;
+		IndexMap::const_iterator _tokens_it;   // scratch iterator for lookups in _tokens
+		std::size_t _nextindex; // next free index for new string
+		StringMap _strings;
+		StringMap::const_iterator _strings_it; // scratch iterator for lookups in _strings
+};
+#endif

data/ext/tupleset.cpp ADDED

@@ -0,0 +1,150 @@
+#include "tupleset.h"
+/**
+ * This file is part of uhferret.
+ *
+ * Author::    Peter Lane
+ * Copyright:: Copyright 2011, Peter Lane.
+ * License::   GPLv3
+ *
+ * uhferret is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * uhferret is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with uhferret.  If not, see <http://www.gnu.org/licenses/>.
+ */
+// Create an empty tuple set. The iteration state is not positioned here;
+// call Begin () before using the iterator methods.
+TupleSet::TupleSet ()
+{}
+// Remove every stored tuple together with its document list.
+void TupleSet::Clear ()
+{
+	_tuple_map.clear (); // the stored iterators are not reset here; call Begin () before iterating again
+}
+// Count the tuples held in the set by walking over all of them.
+// This drives the set's own iterator, so any iteration already in
+// progress is restarted and left at the end position.
+int TupleSet::Size ()
+{
+	int total = 0;
+	Begin ();
+	while (HasMore ())
+	{
+		++total;
+		GetNext ();
+	}
+	return total;
+}
+// Return the document list for the given tuple.
+// Each level is looked up with operator[], so a tuple which is not yet
+// stored is created with an empty document list.
+std::vector<int> & TupleSet::GetDocumentsForTuple (std::size_t token_0, std::size_t token_1, std::size_t token_2)
+{
+	PairMap & pairs = _tuple_map[token_0];
+	WordMap & words = pairs[token_1];
+	return words[token_2];
+}
+// Ensure the given document is listed for the given tuple.
+// Returns true if the document had to be added, false if it was already there.
+bool TupleSet::AddDocument (std::size_t token_0, std::size_t token_1, std::size_t token_2, int document)
+{
+	std::vector<int> & documents = _tuple_map[token_0][token_1][token_2];
+	// look for the document among those already recorded for this tuple
+	for (std::vector<int>::const_iterator it = documents.begin (); it != documents.end (); ++it)
+	{
+		if (*it == document)
+			return false; // already recorded, nothing to do
+	}
+	documents.push_back (document);
+	return true; // indicate that document added
+}
+// Check whether both given documents are listed for the given tuple.
+// The list is fetched with GetDocumentsForTuple, so (as before) a tuple
+// which is not yet stored is created with an empty document list.
+bool TupleSet::IsMatchingTuple (std::size_t t0, std::size_t t1, std::size_t t2, int doc1, int doc2)
+{
+	// bind by reference: the list is only read, so there is no need to copy it
+	const std::vector<int> & documents = GetDocumentsForTuple (t0, t1, t2);
+	bool has_doc1 = false;
+	bool has_doc2 = false;
+	// stop scanning as soon as both documents have been seen
+	for (std::size_t i = 0, n = documents.size (); i < n && !(has_doc1 && has_doc2); ++i)
+	{
+		if (documents[i] == doc1) has_doc1 = true;
+		if (documents[i] == doc2) has_doc2 = true;
+	}
+	return ( has_doc1 && has_doc2 );
+}
+// Return the string form of every tuple listed for both doc1 and doc2.
+// This drives the set's own iterator, so any iteration already in
+// progress is restarted and left at the end position.
+std::vector<std::string> TupleSet::CollectMatchingTuples (int doc1, int doc2, TokenSet & tokenset)
+{
+	std::vector<std::string> shared;
+	Begin ();
+	while (HasMore ())
+	{
+		const bool in_both = IsMatchingTuple (GetToken (0), GetToken (1), GetToken (2),
+				doc1, doc2);
+		if (in_both)
+			shared.push_back (GetStringForCurrentTuple (tokenset));
+		GetNext ();
+	}
+	return shared;
+}
+// Position the iterator on the first stored tuple.
+// With an empty set the iterator is left at the end, so HasMore ()
+// immediately reports false.
+void TupleSet::Begin ()
+{
+	_ti = _tuple_map.begin ();
+	if (_ti == _tuple_map.end ())
+		return; // nothing stored: the end iterator must not be dereferenced
+	// the insertions visible in this file create all three map levels at
+	// once, so the nested maps reached here are expected to be non-empty
+	_pi = (_ti->second).begin ();
+	_wi = (_pi->second).begin ();
+}
+// Advance the iterator to the next stored tuple, stepping through the
+// innermost map first and moving outwards as each level is exhausted.
+void TupleSet::GetNext ()
+{
+	++_wi; // move to next word position
+	if (_wi != (_pi->second).end ())
+		return; // still inside the current pair's words
+
+	// words have finished, so move to next pair position
+	++_pi;
+	if (_pi == (_ti->second).end ())
+	{
+		// pairs have finished, so move to next triple position
+		++_ti;
+		if (_ti == _tuple_map.end ())
+			return; // finished
+		_pi = (_ti->second).begin ();  // get next pair iterator
+	}
+	_wi = (_pi->second).begin ();  // get next word iterator
+}
+// Return true while the iterator refers to a stored tuple, i.e. until the
+// outermost iterator has reached the end of the map.
+bool TupleSet::HasMore () const
+{
+	return _ti != _tuple_map.end ();
+}
+// Return the document list of the tuple the iterator currently refers to.
+// The stored iterators are const, so the modifiable list is fetched again
+// using the three keys.
+std::vector<int> & TupleSet::GetDocumentsForCurrentTuple ()
+{
+	const std::size_t first = _ti->first;
+	const std::size_t second = _pi->first;
+	const std::size_t third = _wi->first;
+	return GetDocumentsForTuple (first, second, third);
+}
+// Build the text form of the current tuple: the strings of its three
+// tokens, looked up in the given token set and separated by single spaces.
+std::string TupleSet::GetStringForCurrentTuple (TokenSet & tokenset) const
+{
+	std::string text = tokenset.GetStringFor (_ti->first);
+	text += " ";
+	text += tokenset.GetStringFor (_pi->first);
+	text += " ";
+	text += tokenset.GetStringFor (_wi->first);
+	return text;
+}
+// Return the identifier of the i-th token (0, 1 or 2) of the current tuple.
+std::size_t TupleSet::GetToken (int i) const
+{
+	assert (i>=0 && i<=2);
+	switch (i)
+	{
+		case 0:
+			return _ti->first;
+		case 1:
+			return _pi->first;
+		default: // i == 2
+			return _wi->first;
+	}
+}

data/ext/tupleset.h ADDED

@@ -0,0 +1,92 @@
+#if !defined tupleset_h
+#define tupleset_h
+/**
+ * This file is part of uhferret.
+ * Initial triple map idea by Bob Dickerson.
+ *
+ * Author::    Peter Lane
+ * Copyright:: Copyright 2011, Peter Lane.
+ * License::   GPLv3
+ *
+ * uhferret is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * uhferret is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with uhferret.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#include <assert.h>
+#include <map>
+#include <string>
+#include <vector>
+#include "tokenset.h"
+/** TupleSet maintains the database mapping trigrams to identifier of documents which contain them.
+  * The mapping is held as a sequence of std::maps, each map taking a std::size_t reference to
+  * a token as a key.  The end result of the three maps is a vector of document identifiers.
+  *
+  * The most important feature of the TupleSet is the collection of methods for iterating over
+  * all tuples in the TupleSet.
+  * e.g. with the definition:  TupleSet tuple_set;
+  * use:                       for (tuple_set.Begin (); tuple_set.HasMore (); tuple_set.GetNext ())
+  *                            {}
+  * to iterate over all the tuples.  The methods: GetDocumentsForCurrentTuple, GetStringForCurrentTuple,
+  * and GetToken (0), GetToken (1), GetToken (2) return information on the current tuple.
+  */
+class TupleSet
+{
+	// typedef's to simplify declarations: one map per token of the tuple,
+	// the innermost mapping to the list of document identifiers
+	typedef std::map<std::size_t, std::vector<int> > WordMap;
+	typedef WordMap::const_iterator WordMapIter;
+	typedef std::map<std::size_t, WordMap> PairMap;
+	typedef PairMap::const_iterator PairMapIter;
+	typedef std::map<std::size_t, PairMap> TripMap;
+	typedef TripMap::const_iterator TripMapIter;
+
+	public:
+		// --- storage and queries
+		TupleSet ();
+		void Clear ();
+		int Size ();
+		// given a tuple, return the list of documents which contain that tuple
+		std::vector<int> & GetDocumentsForTuple (std::size_t token_0,
+				std::size_t token_1, std::size_t token_2);
+		// given a tuple and a document identifier,
+		// - make sure that the document is in the list for that tuple
+		// - returns true if the document was not already in trigram's list
+		bool AddDocument (std::size_t token_0, std::size_t token_1, std::size_t token_2,
+				int document);
+		// check if two documents share the given tuple
+		bool IsMatchingTuple (std::size_t t0, std::size_t t1, std::size_t t2, int doc1, int doc2);
+		// collect and return all tuples in the two given documents
+		std::vector<std::string> CollectMatchingTuples (int doc1, int doc2, TokenSet & tokenset);
+
+		// --- iteration over the stored tuples
+		void Begin ();			// start the iterator
+		void GetNext ();		// advance the iterator
+		bool HasMore () const;		// check for end
+		// retrieve current tuple's documents
+		std::vector<int> & GetDocumentsForCurrentTuple ();
+		// retrieve string for current tuple
+		std::string GetStringForCurrentTuple (TokenSet & tokenset) const;
+		// retrieve identifiers for individual tokens
+		std::size_t GetToken (int i) const;
+
+	private:
+		TripMap	_tuple_map;	// first token -> second token -> third token -> documents
+		// state of the iterator over _tuple_map
+		TripMapIter	_ti;	// iterator from first token to pairs
+		PairMapIter	_pi;	// iterator from second token to words
+		WordMapIter	_wi;	// iterator from third token to document list
+};
+#endif