uhferret 1.3.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/COPYING.txt +674 -0
- data/README.rdoc +79 -0
- data/bin/uhferret +129 -0
- data/bin/uhferret-server +68 -0
- data/ext/document.cpp +231 -0
- data/ext/document.h +89 -0
- data/ext/documentlist.cpp +229 -0
- data/ext/documentlist.h +80 -0
- data/ext/extconf.rb +2 -0
- data/ext/tokenreader.cpp +196 -0
- data/ext/tokenreader.h +85 -0
- data/ext/tokenset.cpp +111 -0
- data/ext/tokenset.h +73 -0
- data/ext/tupleset.cpp +150 -0
- data/ext/tupleset.h +92 -0
- data/ext/uhferret_lib_wrap.cxx +10726 -0
- data/lib/uhferret.rb +441 -0
- data/lib/utils.rb +93 -0
- data/lib/webferret.rb +246 -0
- metadata +71 -0
data/README.rdoc
ADDED
@@ -0,0 +1,79 @@
|
|
1
|
+
= UHFerret
|
2
|
+
|
3
|
+
homepage:: https://peterlane.netlify.org/ferret/
|
4
|
+
source:: https://notabug.org/peterlane/uhferret-gem/releases
|
5
|
+
|
6
|
+
== Description
|
7
|
+
|
8
|
+
UHFerret is a copy-detection tool, supporting the analysis of large sets of
|
9
|
+
documents to find pairs of documents with substantial amounts of lexical
|
10
|
+
copying. Documents containing either natural language (e.g. English) or
|
11
|
+
computer programs (in C-family) may be processed.
|
12
|
+
|
13
|
+
This library provides a Ruby wrapper around uhferret suitable for
|
14
|
+
scripting, a command-line executable, 'uhferret', and a simple
|
15
|
+
server version, 'uhferret-server'.
|
16
|
+
|
17
|
+
NB: to install uhferret, Ruby must be able to compile and build C extensions.
|
18
|
+
|
19
|
+
== Use
|
20
|
+
|
21
|
+
=== Command Line
|
22
|
+
|
23
|
+
Usage: uhferret [options] file1 file2 ...
|
24
|
+
-h, --help help message
|
25
|
+
-c, --code process documents as code
|
26
|
+
-t, --text process documents as text (default)
|
27
|
+
-d, --data-table output similarity table (default)
|
28
|
+
-l, --list-trigrams output trigram list
|
29
|
+
-a, --all-comparisons output list of all comparisons
|
30
|
+
-x, --xml-report FILE generate xml report from two documents
|
31
|
+
-f, --definition-file FILE read document names from file
|
32
|
+
|
33
|
+
To compute the similarities of a set of files, use:
|
34
|
+
|
35
|
+
$ uhferret file1.txt file2.txt ...
|
36
|
+
|
37
|
+
An xml output can be generated for a pair of files using:
|
38
|
+
|
39
|
+
$ uhferret -x outfile.xml file1.txt file2.txt
|
40
|
+
|
41
|
+
The xml output can be displayed in a browser using the style sheet
|
42
|
+
'uhferret.xsl' in the examples folder, and then printed from the browser.
|
43
|
+
|
44
|
+
=== Program
|
45
|
+
|
46
|
+
Ferret can also be used as a library, and called from within a program.
|
47
|
+
For example:
|
48
|
+
|
49
|
+
ferret = Ferret.new
|
50
|
+
ferret.add 'filename1.txt'
|
51
|
+
ferret.add 'filename2.txt'
|
52
|
+
ferret.run
|
53
|
+
ferret.output_similarity_table
|
54
|
+
|
55
|
+
Will create a new instance of Ferret, add two documents, run and then output the
|
56
|
+
similarity between the two.
|
57
|
+
|
58
|
+
=== Server
|
59
|
+
|
60
|
+
Usage: uhferret-server [options]
|
61
|
+
-h, --help help message
|
62
|
+
-p, --port n port number
|
63
|
+
-f, --folder FOLDER base folder
|
64
|
+
|
65
|
+
The folder to store the processed files will default to
|
66
|
+
'FerretFiles' and the port to 2000.
|
67
|
+
Initial address: http://localhost:2000/ferret/home
|
68
|
+
|
69
|
+
NB: The server uses some \*nix commands, and so currently does not work
|
70
|
+
under Windows.
|
71
|
+
|
72
|
+
== Acknowledgements
|
73
|
+
|
74
|
+
UHFerret has been developed at the University of Hertfordshire by members of
|
75
|
+
the Plagiarism Detection Group. The original concept of using trigrams for
|
76
|
+
measuring copying was developed by Caroline Lyon and James Malcolm. JunPeng
|
77
|
+
Bao, Ruth Barrett and Bob Dickerson also contributed to the development of
|
78
|
+
earlier versions of Ferret.
|
79
|
+
|
data/bin/uhferret
ADDED
@@ -0,0 +1,129 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
|
3
|
+
# This file is part of uhferret, providing a command-line interface.
|
4
|
+
#
|
5
|
+
# Author:: Peter Lane
|
6
|
+
# Copyright:: Copyright 2012-20, Peter Lane.
|
7
|
+
# License:: GPLv3
|
8
|
+
#
|
9
|
+
# uhferret is free software: you can redistribute it and/or modify
|
10
|
+
# it under the terms of the GNU General Public License as published by
|
11
|
+
# the Free Software Foundation, either version 3 of the License, or
|
12
|
+
# (at your option) any later version.
|
13
|
+
#
|
14
|
+
# uhferret is distributed in the hope that it will be useful,
|
15
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
+
# GNU General Public License for more details.
|
18
|
+
#
|
19
|
+
# You should have received a copy of the GNU General Public License
|
20
|
+
# along with uhferret. If not, see <http://www.gnu.org/licenses/>.
|
21
|
+
|
22
|
+
require "optparse"
|
23
|
+
require "uhferret"
|
24
|
+
|
25
|
+
MAX_TABLE_SIZE = 100
VERSION = "1.3.7"

# ------------------------------------------------------------------
# Settings controlled by the command-line switches.  The values given here
# are the defaults used when the corresponding switch is absent.
document_type    = UHFerret::TextDocument
input_format     = :from_argv
output_format    = :similarity_table
definitions_file = nil
output_full_path = false
xml_output_file  = ""

# Command-line definition.  Each handler only records the user's choice in
# one of the variables above; nothing is processed until the arguments have
# been parsed.
options = OptionParser.new do |opts|
  opts.banner = "Usage: uhferret [options] file1 file2 ..."

  # -- informational switches: print and leave immediately
  opts.on("-h", "--help", "help message") do
    puts opts
    exit!
  end
  opts.on("-v", "--version", "version") do
    puts "uhferret: version #{VERSION}"
    exit!
  end

  # -- document type
  opts.on("-c", "--code", "process documents as code") { document_type = UHFerret::CodeDocument }
  opts.on("-t", "--text", "process documents as text (default)") { document_type = UHFerret::TextDocument }

  # -- output format
  opts.on("-p", "--full-path", "output full path") { output_full_path = true }
  opts.on("-d", "--data-table", "output similarity table (default)") { output_format = :similarity_table }
  opts.on("-w", "--html-data-table", "output similarity table in html format") { output_format = :html_similarity_table }
  opts.on("-l", "--list-trigrams", "output trigram list") { output_format = :trigram_list }
  opts.on("-a", "--all-comparisons", "output list of all comparisons") { output_format = :all_comparisons }
  opts.on("-x", "--xml-report OUTPUT_FILE", "generate xml report from two documents") do |report_path|
    output_format = :xml_output
    xml_output_file = report_path
  end

  # -- file source
  opts.on("-f FILE", "--definition-file FILE", "read document names from file") do |list_path|
    input_format = :from_file
    definitions_file = list_path
  end
end
|
80
|
+
|
81
|
+
# Main program: parse the switches, load the documents, run the comparison
# and print the requested report.  Any error is reported on standard output
# followed by the usage text.
begin
  # -- process input options (removes the recognised switches from ARGV,
  #    leaving only the document filenames)
  options.parse!

  # -- check some errors
  if output_format == :xml_output
    unless ARGV.size == 2
      puts "Error: for xml report, only provide two input filenames"
      raise ArgumentError.new
    end
  end

  # -- add readable files, and run
  #    (unreadable files are skipped without a message)
  ferret = UHFerret::Ferret.new
  unless definitions_file.nil?
    if File.readable? definitions_file
      ferret.add_list_from_file(definitions_file, document_type)
    end
  end
  ARGV.each do |filename|
    if File.readable? filename
      ferret.add(filename, document_type)
    end
  end
  if ferret.size < 2
    puts "Error: not enough valid filenames"
    raise ArgumentError.new
  end
  ferret.run

  # -- display output
  case output_format
  when :similarity_table
    ferret.output_similarity_table output_full_path
  when :html_similarity_table
    ferret.output_html_similarity_table
  when :trigram_list
    ferret.output_trigram_list
  when :all_comparisons
    ferret.output_all_comparisons
  when :xml_output
    # the two documents of the report are the first two added, i.e. 0 and 1
    ferret.xml_output(xml_output_file, 0, 1)
  end

# Only ordinary errors (bad switches, the ArgumentErrors raised above, ...)
# are turned into a usage message.  Rescuing Exception would also swallow
# Interrupt, NoMemoryError and SystemExit, hiding a Ctrl-C behind a usage dump.
rescue StandardError => err
  puts err
  puts options
end
|
129
|
+
|
data/bin/uhferret-server
ADDED
@@ -0,0 +1,68 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file is part of uhferret, providing a web service interface.
|
4
|
+
#
|
5
|
+
# Author:: Peter Lane
|
6
|
+
# Copyright:: Copyright 2012, Peter Lane.
|
7
|
+
# License:: GPLv3
|
8
|
+
#
|
9
|
+
# uhferret is free software: you can redistribute it and/or modify
|
10
|
+
# it under the terms of the GNU General Public License as published by
|
11
|
+
# the Free Software Foundation, either version 3 of the License, or
|
12
|
+
# (at your option) any later version.
|
13
|
+
#
|
14
|
+
# uhferret is distributed in the hope that it will be useful,
|
15
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
+
# GNU General Public License for more details.
|
18
|
+
#
|
19
|
+
# You should have received a copy of the GNU General Public License
|
20
|
+
# along with uhferret. If not, see <http://www.gnu.org/licenses/>.
|
21
|
+
|
22
|
+
require "optparse"
|
23
|
+
require "webferret"
|
24
|
+
|
25
|
+
# Name of the ferret software
FERRET = "uhferret"
# Base folder for working with files; defaults to 'FerretFiles' below the
# directory the server is started from, and can be replaced with --folder
$base = "#{Dir.pwd}/FerretFiles/"
# Port number for server; can be replaced with --port
port = 2000

# Command-line definition for the server.
options = OptionParser.new do |opts|
  opts.banner = "Usage: uhferret-server [options]"

  # -- print the usage text and leave immediately
  opts.on("-h", "--help", "help message") do
    puts opts
    exit!
  end

  # -- port number (converted to an Integer by OptionParser)
  opts.on("-p", "--port n", Integer, "port number") { |number| port = number }

  # -- base folder (stored as an absolute path)
  opts.on("-f", "--folder FOLDER", "base folder") { |folder| $base = File.expand_path folder }
end
|
47
|
+
|
48
|
+
# Main program: parse the switches, then start a web server offering the
# ferret home and report pages.
begin
  options.parse!

  # Refuse to work if the folder already exists
  # (File.exist? is used because the File.exists? alias is deprecated and
  #  no longer available from Ruby 3.2)
  if File.exist? $base
    puts "Folder #{$base} already exists"
    puts "Please empty the folder before starting the ferret server"
    exit!
  end

  # install the ferret server at given Port number
  ferret = HTTPServer.new(:Port => port, :DocumentRoot => "/")

  ferret.mount "/ferret/home", UHFerret::FerretHomeServlet
  ferret.mount "/ferret/report", UHFerret::FerretReportServlet

  # Ctrl-C asks the server to shut down
  trap("INT") { ferret.shutdown }

  ferret.start
end
|
68
|
+
|
data/ext/document.cpp
ADDED
@@ -0,0 +1,231 @@
|
|
1
|
+
#include "document.h"
|
2
|
+
|
3
|
+
/**
|
4
|
+
* This file is part of uhferret.
|
5
|
+
*
|
6
|
+
* Author:: Peter Lane
|
7
|
+
* Copyright:: Copyright 2011, Peter Lane.
|
8
|
+
* License:: GPLv3
|
9
|
+
*
|
10
|
+
* uhferret is free software: you can redistribute it and/or modify
|
11
|
+
* it under the terms of the GNU General Public License as published by
|
12
|
+
* the Free Software Foundation, either version 3 of the License, or
|
13
|
+
* (at your option) any later version.
|
14
|
+
*
|
15
|
+
* uhferret is distributed in the hope that it will be useful,
|
16
|
+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
17
|
+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
18
|
+
* GNU General Public License for more details.
|
19
|
+
*
|
20
|
+
* You should have received a copy of the GNU General Public License
|
21
|
+
* along with uhferret. If not, see <http://www.gnu.org/licenses/>.
|
22
|
+
*/
|
23
|
+
|
24
|
+
// Construct a document description from its pathname, its type (text or
// code) and the id of the group it belongs to.  The trigram count starts
// at zero; no file is opened until StartInput is called.
Document::Document (std::string pathname, DocumentType type, int id)
  : _pathname (pathname),
  _type (type),
  _num_trigrams (0),
  _group_id (id)
{
  // The stream and reader pointers are only given real values by
  // StartInput/InitialiseInput.  Clear them here so they never hold
  // indeterminate values before input has been started.
  _fb = 0;
  _cin = 0;
  _token_input = 0;

  // ReadTrigram shifts the window before filling it, so give the three
  // slots a defined starting value.
  for (int i = 0; i < 3; ++i)
  {
    _current_tuple[i] = 0;
    _current_start[i] = 0;
  }
}
|
30
|
+
|
31
|
+
// Construct a document with the same pathname, type and group id as the
// given document.  The trigram count is NOT copied: it restarts at zero.
// -- 'document' must point to a valid Document; it is dereferenced
//    without a check
Document::Document (Document * document)
  : _pathname (document->_pathname),
  _type (document->_type),
  _num_trigrams (0),
  _group_id (document->_group_id)
{
  // Input state is never shared with the source document.  Clear the
  // stream and reader pointers so they never hold indeterminate values
  // before StartInput has been called on this object.
  _fb = 0;
  _cin = 0;
  _token_input = 0;

  // ReadTrigram shifts the window before filling it, so give the three
  // slots a defined starting value.
  for (int i = 0; i < 3; ++i)
  {
    _current_tuple[i] = 0;
    _current_start[i] = 0;
  }
}
|
37
|
+
|
38
|
+
// Change the type (text or code) used when this document is next read.
void Document::SetType (DocumentType type)
{
  this->_type = type;
}
|
42
|
+
|
43
|
+
bool Document::IsTextType () const
|
44
|
+
{
|
45
|
+
return (_type == TypeText);
|
46
|
+
}
|
47
|
+
|
48
|
+
std::string Document::GetPathname () const
|
49
|
+
{
|
50
|
+
return _pathname;
|
51
|
+
}
|
52
|
+
|
53
|
+
void Document::SetPathname (std::string pathname)
|
54
|
+
{
|
55
|
+
_pathname = pathname;
|
56
|
+
}
|
57
|
+
|
58
|
+
int Document::GetGroupId () const
|
59
|
+
{
|
60
|
+
return _group_id;
|
61
|
+
}
|
62
|
+
|
63
|
+
void Document::SetGroupId (int id)
|
64
|
+
{
|
65
|
+
_group_id = id;
|
66
|
+
}
|
67
|
+
|
68
|
+
int Document::GetTrigramCount () const
|
69
|
+
{
|
70
|
+
return _num_trigrams;
|
71
|
+
}
|
72
|
+
|
73
|
+
// WARNING: This method should only be used when document definitions are
|
74
|
+
// loaded from a file.
|
75
|
+
void Document::SetTrigramCount (int count)
|
76
|
+
{
|
77
|
+
_num_trigrams = count;
|
78
|
+
}
|
79
|
+
|
80
|
+
void Document::ResetTrigramCount ()
|
81
|
+
{
|
82
|
+
_num_trigrams = 0;
|
83
|
+
}
|
84
|
+
|
85
|
+
void Document::IncrementTrigramCount ()
|
86
|
+
{
|
87
|
+
_num_trigrams += 1;
|
88
|
+
}
|
89
|
+
|
90
|
+
// Start input from the file referred to by this document
|
91
|
+
bool Document::StartInput (TokenSet & tokenset)
|
92
|
+
{
|
93
|
+
_fb = new std::ifstream(GetPathname().c_str (), std::ifstream::in);
|
94
|
+
if (_fb->is_open ())
|
95
|
+
{
|
96
|
+
_cin = _fb;
|
97
|
+
InitialiseInput (tokenset);
|
98
|
+
return true; // signify file opened correctly
|
99
|
+
}
|
100
|
+
else
|
101
|
+
{
|
102
|
+
return false;
|
103
|
+
}
|
104
|
+
}
|
105
|
+
|
106
|
+
// Start input from a provided input stream
|
107
|
+
bool Document::StartInput (std::istream & input, TokenSet & tokenset)
|
108
|
+
{
|
109
|
+
_cin = &input;
|
110
|
+
InitialiseInput (tokenset);
|
111
|
+
return true;
|
112
|
+
}
|
113
|
+
|
114
|
+
// Start input by constructing a new Reader based on current document type
|
115
|
+
// TokenSet is provided by caller, so Reader uses common set of labels for tokens
|
116
|
+
void Document::InitialiseInput (TokenSet & tokenset)
|
117
|
+
{
|
118
|
+
if (_type == TypeText)
|
119
|
+
{
|
120
|
+
_token_input = new WordReader (* _cin);
|
121
|
+
}
|
122
|
+
else // (_type == typeCode)
|
123
|
+
{
|
124
|
+
_token_input = new CCodeReader (* _cin);
|
125
|
+
}
|
126
|
+
ReadTrigram (tokenset); // read first two tokens so next call to
|
127
|
+
ReadTrigram (tokenset); // ReadTrigram returns the first complete trigram
|
128
|
+
}
|
129
|
+
|
130
|
+
// returns true if this document's filetype is the same as the given extension
|
131
|
+
// -- note, case is ignored, so "txt" == "TXT" == "tXt"
|
132
|
+
bool Document::IsFileType (std::string extension) const
|
133
|
+
{
|
134
|
+
int dot_posn = _pathname.find_last_of ('.', true); // search for last dot, i.e. from end
|
135
|
+
if (dot_posn == _pathname.npos) return false;
|
136
|
+
std::string file_extension = _pathname.substr (dot_posn+1, _pathname.npos-(dot_posn+1));
|
137
|
+
|
138
|
+
return StringToUpper(file_extension) == StringToUpper(extension); // ignore case in comparison
|
139
|
+
}
|
140
|
+
|
141
|
+
std::string Document::StringToUpper (std::string str) const
|
142
|
+
{
|
143
|
+
std::string nstr = "";
|
144
|
+
|
145
|
+
for(int i=0, l = str.length (); i < l; i += 1)
|
146
|
+
{
|
147
|
+
nstr += std::toupper(str[i]);
|
148
|
+
}
|
149
|
+
return nstr;
|
150
|
+
}
|
151
|
+
|
152
|
+
// Test if file extension represents a c-type language
|
153
|
+
bool Document::IsCodeType () const
|
154
|
+
{
|
155
|
+
return IsFileType ("cpp") ||
|
156
|
+
IsFileType ("c") ||
|
157
|
+
IsFileType ("java") ||
|
158
|
+
IsFileType ("h");
|
159
|
+
}
|
160
|
+
|
161
|
+
// Test if file extension represents a pure text document
|
162
|
+
bool Document::IsTxtType () const
|
163
|
+
{
|
164
|
+
return IsFileType ("txt");
|
165
|
+
}
|
166
|
+
|
167
|
+
// Test if file is not a known type
|
168
|
+
bool Document::IsUnknownType () const
|
169
|
+
{
|
170
|
+
return ! (IsCodeType () || IsTxtType ());
|
171
|
+
}
|
172
|
+
|
173
|
+
// Reads next input token and updates information held on current trigram.
|
174
|
+
// return true if a trigram has been read and is ready for retrieval
|
175
|
+
bool Document::ReadTrigram (TokenSet & tokenset)
|
176
|
+
{
|
177
|
+
_current_tuple[0] = _current_tuple[1];
|
178
|
+
_current_tuple[1] = _current_tuple[2];
|
179
|
+
_current_start[0] = _current_start[1];
|
180
|
+
_current_start[1] = _current_start[2];
|
181
|
+
if ( _token_input->ReadToken () )
|
182
|
+
{
|
183
|
+
_current_tuple[2] = _token_input->GetToken (tokenset);
|
184
|
+
_current_start[2] = _token_input->GetTokenStart ();
|
185
|
+
return true;
|
186
|
+
}
|
187
|
+
else
|
188
|
+
{
|
189
|
+
return false;
|
190
|
+
}
|
191
|
+
|
192
|
+
}
|
193
|
+
|
194
|
+
// retrieve a token of the current tuple, based on position within tuple
|
195
|
+
// -- as we only deal with trigrams, index must be in [0,2]
|
196
|
+
std::size_t Document::GetToken (int i) const
|
197
|
+
{
|
198
|
+
assert (i>=0 && i<=2);
|
199
|
+
return _current_tuple[i];
|
200
|
+
}
|
201
|
+
|
202
|
+
// retrieve the start position of current trigram
|
203
|
+
std::size_t Document::GetTrigramStart () const
|
204
|
+
{
|
205
|
+
return _current_start[0];
|
206
|
+
}
|
207
|
+
|
208
|
+
// retrieve the start position of token i within current trigram
|
209
|
+
// -- used to get start position of second word
|
210
|
+
std::size_t Document::GetTrigramStart (int i) const
|
211
|
+
{
|
212
|
+
assert (i>=0 && i<=2);
|
213
|
+
return _current_start[i];
|
214
|
+
}
|
215
|
+
|
216
|
+
// retrieve the end position of the current token
|
217
|
+
std::size_t Document::GetTrigramEnd () const
|
218
|
+
{
|
219
|
+
return _token_input->GetTokenEnd ();
|
220
|
+
}
|
221
|
+
|
222
|
+
// Close up the input file buffer
|
223
|
+
void Document::CloseInput ()
|
224
|
+
{
|
225
|
+
delete _token_input;
|
226
|
+
if (_fb->is_open ())
|
227
|
+
{
|
228
|
+
_fb->close ();
|
229
|
+
}
|
230
|
+
}
|
231
|
+
|