uhferret 1.3.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/COPYING.txt +674 -0
- data/README.rdoc +79 -0
- data/bin/uhferret +129 -0
- data/bin/uhferret-server +68 -0
- data/ext/document.cpp +231 -0
- data/ext/document.h +89 -0
- data/ext/documentlist.cpp +229 -0
- data/ext/documentlist.h +80 -0
- data/ext/extconf.rb +2 -0
- data/ext/tokenreader.cpp +196 -0
- data/ext/tokenreader.h +85 -0
- data/ext/tokenset.cpp +111 -0
- data/ext/tokenset.h +73 -0
- data/ext/tupleset.cpp +150 -0
- data/ext/tupleset.h +92 -0
- data/ext/uhferret_lib_wrap.cxx +10726 -0
- data/lib/uhferret.rb +441 -0
- data/lib/utils.rb +93 -0
- data/lib/webferret.rb +246 -0
- metadata +71 -0
data/README.rdoc
ADDED
@@ -0,0 +1,79 @@
|
|
1
|
+
= UHFerret
|
2
|
+
|
3
|
+
homepage:: https://peterlane.netlify.org/ferret/
|
4
|
+
source:: https://notabug.org/peterlane/uhferret-gem/releases
|
5
|
+
|
6
|
+
== Description
|
7
|
+
|
8
|
+
UHFerret is a copy-detection tool, supporting the analysis of large sets of
|
9
|
+
documents to find pairs of documents with substantial amounts of lexical
|
10
|
+
copying. Documents containing either natural language (e.g. English) or
|
11
|
+
computer programs (in C-family) may be processed.
|
12
|
+
|
13
|
+
This library provides a Ruby wrapper around uhferret suitable for
|
14
|
+
scripting, a command-line executable, 'uhferret', and a simple
|
15
|
+
server version, 'uhferret-server'.
|
16
|
+
|
17
|
+
NB: to install uhferret, Ruby must be able to compile and build native C/C++ extensions.
|
18
|
+
|
19
|
+
== Use
|
20
|
+
|
21
|
+
=== Command Line
|
22
|
+
|
23
|
+
Usage: uhferret [options] file1 file2 ...
|
24
|
+
-h, --help help message
|
25
|
+
-c, --code process documents as code
|
26
|
+
-t, --text process documents as text (default)
|
27
|
+
-d, --data-table output similarity table (default)
|
28
|
+
-l, --list-trigrams output trigram list
|
29
|
+
-a, --all-comparisons output list of all comparisons
|
30
|
+
-x, --xml-report FILE generate xml report from two documents
|
31
|
+
-f, --definition-file FILE read document names from file
|
32
|
+
|
33
|
+
To compute the similarities of a set of files, use:
|
34
|
+
|
35
|
+
$ uhferret file1.txt file2.txt ...
|
36
|
+
|
37
|
+
An xml output can be generated for a pair of files using:
|
38
|
+
|
39
|
+
$ uhferret -x outfile.xml file1.txt file2.txt
|
40
|
+
|
41
|
+
The xml output can be displayed in a browser using the style sheet
|
42
|
+
'uhferret.xsl' in the examples folder, and then printed from the browser.
|
43
|
+
|
44
|
+
=== Program
|
45
|
+
|
46
|
+
Ferret can also be used as a library, and called from within a program.
|
47
|
+
For example:
|
48
|
+
|
49
|
+
ferret = Ferret.new
|
50
|
+
ferret.add 'filename1.txt'
|
51
|
+
ferret.add 'filename2.txt'
|
52
|
+
ferret.run
|
53
|
+
ferret.output_similarity_table
|
54
|
+
|
55
|
+
This will create a new instance of Ferret, add two documents, run and then output the
|
56
|
+
similarity between the two.
|
57
|
+
|
58
|
+
=== Server
|
59
|
+
|
60
|
+
Usage: uhferret-server [options]
|
61
|
+
-h, --help help message
|
62
|
+
-p, --port n port number
|
63
|
+
-f, --folder FOLDER base folder
|
64
|
+
|
65
|
+
The folder to store the processed files will default to
|
66
|
+
'FerretFiles' and the port to 2000.
|
67
|
+
Initial address: http://localhost:2000/ferret/home
|
68
|
+
|
69
|
+
NB: The server uses some \*nix commands, and so currently does not work
|
70
|
+
under Windows.
|
71
|
+
|
72
|
+
== Acknowledgements
|
73
|
+
|
74
|
+
UHFerret has been developed at the University of Hertfordshire by members of
|
75
|
+
the Plagiarism Detection Group. The original concept of using trigrams for
|
76
|
+
measuring copying was developed by Caroline Lyon and James Malcolm. JunPeng
|
77
|
+
Bao, Ruth Barrett and Bob Dickerson also contributed to the development of
|
78
|
+
earlier versions of Ferret.
|
79
|
+
|
data/bin/uhferret
ADDED
@@ -0,0 +1,129 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
|
3
|
+
# This file is part of uhferret, providing a command-line interface.
|
4
|
+
#
|
5
|
+
# Author:: Peter Lane
|
6
|
+
# Copyright:: Copyright 2012-20, Peter Lane.
|
7
|
+
# License:: GPLv3
|
8
|
+
#
|
9
|
+
# uhferret is free software: you can redistribute it and/or modify
|
10
|
+
# it under the terms of the GNU General Public License as published by
|
11
|
+
# the Free Software Foundation, either version 3 of the License, or
|
12
|
+
# (at your option) any later version.
|
13
|
+
#
|
14
|
+
# uhferret is distributed in the hope that it will be useful,
|
15
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
+
# GNU General Public License for more details.
|
18
|
+
#
|
19
|
+
# You should have received a copy of the GNU General Public License
|
20
|
+
# along with uhferret. If not, see <http://www.gnu.org/licenses/>.
|
21
|
+
|
22
|
+
require "optparse"
|
23
|
+
require "uhferret"
|
24
|
+
|
25
|
+
MAX_TABLE_SIZE = 100
VERSION = "1.3.7"

# ------------------------------------------------------------------
# Settings filled in by the command-line options below; the values
# given here are the defaults used when no option overrides them.
document_type = UHFerret::TextDocument
input_format = :from_argv
output_format = :similarity_table
definitions_file = nil
output_full_path = false
xml_output_file = ""

# Describe the command-line interface. Handlers only record the user's
# choices in the local variables above; the work happens after parsing.
options = OptionParser.new do |opts|
  opts.banner = "Usage: uhferret [options] file1 file2 ..."

  # NOTE: exit! (not exit) is used so that no SystemExit exception is raised
  # while the options are being parsed.
  opts.on("-h", "--help", "help message") do
    puts opts
    exit!
  end
  opts.on("-v", "--version", "version") do
    puts "uhferret: version #{VERSION}"
    exit!
  end

  # -- document type
  opts.on("-c", "--code", "process documents as code") { document_type = UHFerret::CodeDocument }
  opts.on("-t", "--text", "process documents as text (default)") { document_type = UHFerret::TextDocument }

  # -- output format
  opts.on("-p", "--full-path", "output full path") { output_full_path = true }
  opts.on("-d", "--data-table", "output similarity table (default)") { output_format = :similarity_table }
  opts.on("-w", "--html-data-table", "output similarity table in html format") { output_format = :html_similarity_table }
  opts.on("-l", "--list-trigrams", "output trigram list") { output_format = :trigram_list }
  opts.on("-a", "--all-comparisons", "output list of all comparisons") { output_format = :all_comparisons }
  opts.on("-x", "--xml-report OUTPUT_FILE", "generate xml report from two documents") do |report_file|
    output_format = :xml_output
    xml_output_file = report_file
  end

  # -- file source
  opts.on("-f FILE", "--definition-file FILE", "read document names from file") do |names_file|
    input_format = :from_file
    definitions_file = names_file
  end
end
|
80
|
+
|
81
|
+
# Main program: parse the options, load every readable document, run the
# comparison and print the result in the requested format.
# Any error is reported together with the usage message.
begin
  # -- process input options (leaves the remaining filenames in ARGV)
  options.parse!

  # -- check some errors: an xml report compares exactly two documents
  if output_format == :xml_output && ARGV.size != 2
    puts "Error: for xml report, only provide two input filenames"
    raise ArgumentError.new
  end

  # -- add readable files, and run
  ferret = UHFerret::Ferret.new
  # documents named in the definitions file (skipped if the file is unreadable)
  if !definitions_file.nil? && File.readable?(definitions_file)
    ferret.add_list_from_file(definitions_file, document_type)
  end
  # documents named directly on the command line (unreadable ones are skipped)
  ARGV.each do |filename|
    ferret.add(filename, document_type) if File.readable?(filename)
  end
  if ferret.size < 2
    puts "Error: not enough valid filenames"
    raise ArgumentError.new
  end
  ferret.run

  # -- display output
  case output_format
  when :similarity_table
    ferret.output_similarity_table output_full_path
  when :html_similarity_table
    ferret.output_html_similarity_table
  when :trigram_list
    ferret.output_trigram_list
  when :all_comparisons
    ferret.output_all_comparisons
  when :xml_output
    ferret.xml_output(xml_output_file, 0, 1)
  end

# Rescue StandardError rather than Exception, so that Interrupt (Ctrl-C),
# SystemExit and fatal interpreter errors are not swallowed and turned into
# a usage message.
rescue StandardError => err
  puts err
  puts options
end
|
129
|
+
|
data/bin/uhferret-server
ADDED
@@ -0,0 +1,68 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file is part of uhferret, providing a web service interface.
|
4
|
+
#
|
5
|
+
# Author:: Peter Lane
|
6
|
+
# Copyright:: Copyright 2012, Peter Lane.
|
7
|
+
# License:: GPLv3
|
8
|
+
#
|
9
|
+
# uhferret is free software: you can redistribute it and/or modify
|
10
|
+
# it under the terms of the GNU General Public License as published by
|
11
|
+
# the Free Software Foundation, either version 3 of the License, or
|
12
|
+
# (at your option) any later version.
|
13
|
+
#
|
14
|
+
# uhferret is distributed in the hope that it will be useful,
|
15
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
+
# GNU General Public License for more details.
|
18
|
+
#
|
19
|
+
# You should have received a copy of the GNU General Public License
|
20
|
+
# along with uhferret. If not, see <http://www.gnu.org/licenses/>.
|
21
|
+
|
22
|
+
require "optparse"
|
23
|
+
require "webferret"
|
24
|
+
|
25
|
+
# Name of the ferret software
FERRET = "uhferret"
# Base folder for working with files (global; may be replaced by --folder)
# NOTE(review): this default ends with a "/" whereas a folder given with
# --folder is stored via File.expand_path, without one - confirm that the
# code using $base copes with both forms.
$base = "#{Dir.pwd}/FerretFiles/"
# Port number for server (may be replaced by --port)
port = 2000

# Describe the command-line interface of the server.
options = OptionParser.new do |opts|
  opts.banner = "Usage: uhferret-server [options]"
  opts.on("-h", "--help", "help message") do
    puts opts
    exit!
  end
  # -- port number, converted to an Integer by OptionParser
  opts.on("-p", "--port n", Integer, "port number") { |number| port = number }
  # -- base folder, stored as an absolute path
  opts.on("-f", "--folder FOLDER", "base folder") { |folder| $base = File.expand_path(folder) }
end
|
47
|
+
|
48
|
+
# Main program: parse the options, check the working folder and start the
# web server.
begin
  options.parse!

  # Refuse to work if the folder already exists
  # (File.exist? is used because File.exists? was deprecated and then
  # removed in Ruby 3.2.)
  if File.exist? $base
    puts "Folder #{$base} already exists"
    puts "Please empty the folder before starting the ferret server"
    exit!
  end

  # install the ferret server at given Port number
  ferret = HTTPServer.new(:Port => port, :DocumentRoot => "/")

  ferret.mount "/ferret/home", UHFerret::FerretHomeServlet
  ferret.mount "/ferret/report", UHFerret::FerretReportServlet

  # shut the server down cleanly on Ctrl-C
  trap("INT") { ferret.shutdown }

  ferret.start
end
|
68
|
+
|
data/ext/document.cpp
ADDED
@@ -0,0 +1,231 @@
|
|
1
|
+
#include "document.h"
|
2
|
+
|
3
|
+
/**
|
4
|
+
* This file is part of uhferret.
|
5
|
+
*
|
6
|
+
* Author:: Peter Lane
|
7
|
+
* Copyright:: Copyright 2011, Peter Lane.
|
8
|
+
* License:: GPLv3
|
9
|
+
*
|
10
|
+
* uhferret is free software: you can redistribute it and/or modify
|
11
|
+
* it under the terms of the GNU General Public License as published by
|
12
|
+
* the Free Software Foundation, either version 3 of the License, or
|
13
|
+
* (at your option) any later version.
|
14
|
+
*
|
15
|
+
* uhferret is distributed in the hope that it will be useful,
|
16
|
+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
17
|
+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
18
|
+
* GNU General Public License for more details.
|
19
|
+
*
|
20
|
+
* You should have received a copy of the GNU General Public License
|
21
|
+
* along with uhferret. If not, see <http://www.gnu.org/licenses/>.
|
22
|
+
*/
|
23
|
+
|
24
|
+
// Construct a document for the file at 'pathname', of the given type and
// belonging to group 'id'. The trigram count starts at zero.
// NOTE(review): the input-related members used elsewhere in this file
// (_fb, _cin, _token_input) are not given a value here - confirm they are
// initialised before CloseInput can be reached.
Document::Document (std::string pathname, DocumentType type, int id)
  : _pathname (pathname), _type (type), _num_trigrams (0), _group_id (id)
{
}
|
30
|
+
|
31
|
+
// Construct a document from an existing one: the pathname, type and group id
// are taken from 'document', but the trigram count starts again at zero.
Document::Document (Document * document)
  : _pathname (document->_pathname), _type (document->_type), _num_trigrams (0), _group_id (document->_group_id)
{
}
|
37
|
+
|
38
|
+
// Change the type of this document to 'type'.
void Document::SetType (DocumentType type)
{
  this->_type = type;
}
|
42
|
+
|
43
|
+
bool Document::IsTextType () const
|
44
|
+
{
|
45
|
+
return (_type == TypeText);
|
46
|
+
}
|
47
|
+
|
48
|
+
std::string Document::GetPathname () const
|
49
|
+
{
|
50
|
+
return _pathname;
|
51
|
+
}
|
52
|
+
|
53
|
+
void Document::SetPathname (std::string pathname)
|
54
|
+
{
|
55
|
+
_pathname = pathname;
|
56
|
+
}
|
57
|
+
|
58
|
+
int Document::GetGroupId () const
|
59
|
+
{
|
60
|
+
return _group_id;
|
61
|
+
}
|
62
|
+
|
63
|
+
void Document::SetGroupId (int id)
|
64
|
+
{
|
65
|
+
_group_id = id;
|
66
|
+
}
|
67
|
+
|
68
|
+
int Document::GetTrigramCount () const
|
69
|
+
{
|
70
|
+
return _num_trigrams;
|
71
|
+
}
|
72
|
+
|
73
|
+
// WARNING: This method should only be used when document definitions are
|
74
|
+
// loaded from a file.
|
75
|
+
void Document::SetTrigramCount (int count)
|
76
|
+
{
|
77
|
+
_num_trigrams = count;
|
78
|
+
}
|
79
|
+
|
80
|
+
void Document::ResetTrigramCount ()
|
81
|
+
{
|
82
|
+
_num_trigrams = 0;
|
83
|
+
}
|
84
|
+
|
85
|
+
void Document::IncrementTrigramCount ()
|
86
|
+
{
|
87
|
+
_num_trigrams += 1;
|
88
|
+
}
|
89
|
+
|
90
|
+
// Start input from the file referred to by this document
|
91
|
+
bool Document::StartInput (TokenSet & tokenset)
|
92
|
+
{
|
93
|
+
_fb = new std::ifstream(GetPathname().c_str (), std::ifstream::in);
|
94
|
+
if (_fb->is_open ())
|
95
|
+
{
|
96
|
+
_cin = _fb;
|
97
|
+
InitialiseInput (tokenset);
|
98
|
+
return true; // signify file opened correctly
|
99
|
+
}
|
100
|
+
else
|
101
|
+
{
|
102
|
+
return false;
|
103
|
+
}
|
104
|
+
}
|
105
|
+
|
106
|
+
// Start input from a provided input stream
|
107
|
+
bool Document::StartInput (std::istream & input, TokenSet & tokenset)
|
108
|
+
{
|
109
|
+
_cin = &input;
|
110
|
+
InitialiseInput (tokenset);
|
111
|
+
return true;
|
112
|
+
}
|
113
|
+
|
114
|
+
// Start input by constructing a new Reader based on current document type
|
115
|
+
// TokenSet is provided by caller, so Reader uses common set of labels for tokens
|
116
|
+
void Document::InitialiseInput (TokenSet & tokenset)
|
117
|
+
{
|
118
|
+
if (_type == TypeText)
|
119
|
+
{
|
120
|
+
_token_input = new WordReader (* _cin);
|
121
|
+
}
|
122
|
+
else // (_type == typeCode)
|
123
|
+
{
|
124
|
+
_token_input = new CCodeReader (* _cin);
|
125
|
+
}
|
126
|
+
ReadTrigram (tokenset); // read first two tokens so next call to
|
127
|
+
ReadTrigram (tokenset); // ReadTrigram returns the first complete trigram
|
128
|
+
}
|
129
|
+
|
130
|
+
// returns true if this document's filetype is the same as the given extension
|
131
|
+
// -- note, case is ignored, so "txt" == "TXT" == "tXt"
|
132
|
+
bool Document::IsFileType (std::string extension) const
|
133
|
+
{
|
134
|
+
int dot_posn = _pathname.find_last_of ('.', true); // search for last dot, i.e. from end
|
135
|
+
if (dot_posn == _pathname.npos) return false;
|
136
|
+
std::string file_extension = _pathname.substr (dot_posn+1, _pathname.npos-(dot_posn+1));
|
137
|
+
|
138
|
+
return StringToUpper(file_extension) == StringToUpper(extension); // ignore case in comparison
|
139
|
+
}
|
140
|
+
|
141
|
+
std::string Document::StringToUpper (std::string str) const
|
142
|
+
{
|
143
|
+
std::string nstr = "";
|
144
|
+
|
145
|
+
for(int i=0, l = str.length (); i < l; i += 1)
|
146
|
+
{
|
147
|
+
nstr += std::toupper(str[i]);
|
148
|
+
}
|
149
|
+
return nstr;
|
150
|
+
}
|
151
|
+
|
152
|
+
// Test if file extension represents a c-type language
|
153
|
+
bool Document::IsCodeType () const
|
154
|
+
{
|
155
|
+
return IsFileType ("cpp") ||
|
156
|
+
IsFileType ("c") ||
|
157
|
+
IsFileType ("java") ||
|
158
|
+
IsFileType ("h");
|
159
|
+
}
|
160
|
+
|
161
|
+
// Test if file extension represents a pure text document
|
162
|
+
bool Document::IsTxtType () const
|
163
|
+
{
|
164
|
+
return IsFileType ("txt");
|
165
|
+
}
|
166
|
+
|
167
|
+
// Test if file is not a known type
|
168
|
+
bool Document::IsUnknownType () const
|
169
|
+
{
|
170
|
+
return ! (IsCodeType () || IsTxtType ());
|
171
|
+
}
|
172
|
+
|
173
|
+
// Reads next input token and updates information held on current trigram.
|
174
|
+
// return true if a trigram has been read and is ready for retrieval
|
175
|
+
bool Document::ReadTrigram (TokenSet & tokenset)
|
176
|
+
{
|
177
|
+
_current_tuple[0] = _current_tuple[1];
|
178
|
+
_current_tuple[1] = _current_tuple[2];
|
179
|
+
_current_start[0] = _current_start[1];
|
180
|
+
_current_start[1] = _current_start[2];
|
181
|
+
if ( _token_input->ReadToken () )
|
182
|
+
{
|
183
|
+
_current_tuple[2] = _token_input->GetToken (tokenset);
|
184
|
+
_current_start[2] = _token_input->GetTokenStart ();
|
185
|
+
return true;
|
186
|
+
}
|
187
|
+
else
|
188
|
+
{
|
189
|
+
return false;
|
190
|
+
}
|
191
|
+
|
192
|
+
}
|
193
|
+
|
194
|
+
// retrieve a token of the current tuple, based on position within tuple
|
195
|
+
// -- as we only deal with trigrams, index must be in [0,2]
|
196
|
+
std::size_t Document::GetToken (int i) const
|
197
|
+
{
|
198
|
+
assert (i>=0 && i<=2);
|
199
|
+
return _current_tuple[i];
|
200
|
+
}
|
201
|
+
|
202
|
+
// retrieve the start position of current trigram
|
203
|
+
std::size_t Document::GetTrigramStart () const
|
204
|
+
{
|
205
|
+
return _current_start[0];
|
206
|
+
}
|
207
|
+
|
208
|
+
// retrieve the start position of token i within current trigram
|
209
|
+
// -- used to get start position of second word
|
210
|
+
std::size_t Document::GetTrigramStart (int i) const
|
211
|
+
{
|
212
|
+
assert (i>=0 && i<=2);
|
213
|
+
return _current_start[i];
|
214
|
+
}
|
215
|
+
|
216
|
+
// retrieve the end position of the current token
|
217
|
+
std::size_t Document::GetTrigramEnd () const
|
218
|
+
{
|
219
|
+
return _token_input->GetTokenEnd ();
|
220
|
+
}
|
221
|
+
|
222
|
+
// Close up the input file buffer
|
223
|
+
void Document::CloseInput ()
|
224
|
+
{
|
225
|
+
delete _token_input;
|
226
|
+
if (_fb->is_open ())
|
227
|
+
{
|
228
|
+
_fb->close ();
|
229
|
+
}
|
230
|
+
}
|
231
|
+
|