uhferret 1.3.7

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,79 @@
1
+ = UHFerret
2
+
3
+ homepage:: https://peterlane.netlify.org/ferret/
4
+ source:: https://notabug.org/peterlane/uhferret-gem/releases
5
+
6
+ == Description
7
+
8
+ UHFerret is a copy-detection tool, supporting the analysis of large sets of
9
+ documents to find pairs of documents with substantial amounts of lexical
10
+ copying. Documents containing either natural language (e.g. English) or
11
+ computer programs (in C-family) may be processed.
12
+
13
+ This library provides a Ruby wrapper around uhferret suitable for
14
+ scripting, a command-line executable, 'uhferret', and a simple
15
+ server version, 'uhferret-server'.
16
+
17
+ NB: to install uhferret, Ruby must be able to compile and build C extensions.
18
+
19
+ == Use
20
+
21
+ === Command Line
22
+
23
+ Usage: uhferret [options] file1 file2 ...
24
+ -h, --help help message
25
+ -c, --code process documents as code
26
+ -t, --text process documents as text (default)
27
+ -d, --data-table output similarity table (default)
28
+ -l, --list-trigrams output trigram list
29
+ -a, --all-comparisons output list of all comparisons
30
+ -x, --xml-report FILE generate xml report from two documents
31
+ -f, --definition-file FILE read document names from file
32
+
33
+ To compute the similarities of a set of files, use:
34
+
35
+ $ uhferret file1.txt file2.txt ...
36
+
37
+ An xml output can be generated for a pair of files using:
38
+
39
+ $ uhferret -x outfile.xml file1.txt file2.txt
40
+
41
+ The xml output can be displayed in a browser using the style sheet
42
+ 'uhferret.xsl' in the examples folder, and then printed from the browser.
43
+
44
+ === Program
45
+
46
+ Ferret can also be used as a library, and called from within a program.
47
+ For example:
48
+
49
+ ferret = Ferret.new
50
+ ferret.add 'filename1.txt'
51
+ ferret.add 'filename2.txt'
52
+ ferret.run
53
+ ferret.output_similarity_table
54
+
55
+ Will create a new instance of Ferret, add two documents, run and then output the
56
+ similarity between the two.
57
+
58
+ === Server
59
+
60
+ Usage: uhferret-server [options]
61
+ -h, --help help message
62
+ -p, --port n port number
63
+ -f, --folder FOLDER base folder
64
+
65
+ The folder to store the processed files will default to
66
+ 'FerretFiles' and the port to 2000.
67
+ Initial address: http://localhost:2000/ferret/home
68
+
69
+ NB: The server uses some \*nix commands, and so currently does not work
70
+ under Windows.
71
+
72
+ == Acknowledgements
73
+
74
+ UHFerret has been developed at the University of Hertfordshire by members of
75
+ the Plagiarism Detection Group. The original concept of using trigrams for
76
+ measuring copying was developed by Caroline Lyon and James Malcolm. JunPeng
77
+ Bao, Ruth Barrett and Bob Dickerson also contributed to the development of
78
+ earlier versions of Ferret.
79
+
@@ -0,0 +1,129 @@
1
+ #! /usr/bin/env ruby
2
+
3
+ # This file is part of uhferret, providing a command-line interface.
4
+ #
5
+ # Author:: Peter Lane
6
+ # Copyright:: Copyright 2012-20, Peter Lane.
7
+ # License:: GPLv3
8
+ #
9
+ # uhferret is free software: you can redistribute it and/or modify
10
+ # it under the terms of the GNU General Public License as published by
11
+ # the Free Software Foundation, either version 3 of the License, or
12
+ # (at your option) any later version.
13
+ #
14
+ # uhferret is distributed in the hope that it will be useful,
15
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
+ # GNU General Public License for more details.
18
+ #
19
+ # You should have received a copy of the GNU General Public License
20
+ # along with uhferret. If not, see <http://www.gnu.org/licenses/>.
21
+
22
+ require "optparse"
23
+ require "uhferret"
24
+
25
+ MAX_TABLE_SIZE = 100
26
+ VERSION = "1.3.7"
27
+
28
+ # ------------------------------------------------------------------
29
+ #
30
+ document_type = UHFerret::TextDocument
31
+ input_format = :from_argv
32
+ output_format = :similarity_table
33
+ definitions_file = nil
34
+ output_full_path = false
35
+ xml_output_file = ""
36
+
37
+ options = OptionParser.new do |opts|
38
+ opts.banner = "Usage: uhferret [options] file1 file2 ..."
39
+ opts.on("-h", "--help", "help message") do |v|
40
+ puts options
41
+ exit!
42
+ end
43
+ opts.on("-v", "--version", "version") do |v|
44
+ puts "uhferret: version #{VERSION}"
45
+ exit!
46
+ end
47
+ # -- document type
48
+ opts.on("-c", "--code", "process documents as code") do |v|
49
+ document_type = UHFerret::CodeDocument
50
+ end
51
+ opts.on("-t", "--text", "process documents as text (default)") do |v|
52
+ document_type = UHFerret::TextDocument
53
+ end
54
+ # -- output format
55
+ opts.on("-p", "--full-path", "output full path") do |v|
56
+ output_full_path = true
57
+ end
58
+ opts.on("-d", "--data-table", "output similarity table (default)") do |v|
59
+ output_format = :similarity_table
60
+ end
61
+ opts.on("-w", "--html-data-table", "output similarity table in html format") do |v|
62
+ output_format = :html_similarity_table
63
+ end
64
+ opts.on("-l", "--list-trigrams", "output trigram list") do |v|
65
+ output_format = :trigram_list
66
+ end
67
+ opts.on("-a", "--all-comparisons", "output list of all comparisons") do |v|
68
+ output_format = :all_comparisons
69
+ end
70
+ opts.on("-x", "--xml-report OUTPUT_FILE", "generate xml report from two documents") do |file|
71
+ output_format = :xml_output
72
+ xml_output_file = file
73
+ end
74
+ # -- file source
75
+ opts.on("-f FILE", "--definition-file FILE", "read document names from file") do |file|
76
+ input_format = :from_file
77
+ definitions_file = file
78
+ end
79
+ end
80
+
81
+ begin
82
+ # -- process input options
83
+ options.parse!
84
+
85
+ # -- check some errors
86
+ if output_format == :xml_output
87
+ unless ARGV.size == 2
88
+ puts "Error: for xml report, only provide two input filenames"
89
+ raise ArgumentError.new
90
+ end
91
+ end
92
+
93
+ # -- add readable files, and run
94
+ ferret = UHFerret::Ferret.new
95
+ unless definitions_file.nil?
96
+ if File.readable? definitions_file
97
+ ferret.add_list_from_file(definitions_file, document_type)
98
+ end
99
+ end
100
+ ARGV.each do |filename|
101
+ if File.readable? filename
102
+ ferret.add(filename, document_type)
103
+ end
104
+ end
105
+ if ferret.size < 2
106
+ puts "Error: not enough valid filenames"
107
+ raise ArgumentError.new
108
+ end
109
+ ferret.run
110
+
111
+ # -- display output
112
+ case output_format
113
+ when :similarity_table
114
+ ferret.output_similarity_table output_full_path
115
+ when :html_similarity_table
116
+ ferret.output_html_similarity_table
117
+ when :trigram_list
118
+ ferret.output_trigram_list
119
+ when :all_comparisons
120
+ ferret.output_all_comparisons
121
+ when :xml_output
122
+ ferret.xml_output(xml_output_file, 0, 1)
123
+ end
124
+
125
+ rescue Exception => err
126
+ puts err
127
+ puts options
128
+ end
129
+
@@ -0,0 +1,68 @@
1
+ #! /usr/bin/env ruby
2
+ #
3
+ # This file is part of uhferret, providing a web service interface.
4
+ #
5
+ # Author:: Peter Lane
6
+ # Copyright:: Copyright 2012, Peter Lane.
7
+ # License:: GPLv3
8
+ #
9
+ # uhferret is free software: you can redistribute it and/or modify
10
+ # it under the terms of the GNU General Public License as published by
11
+ # the Free Software Foundation, either version 3 of the License, or
12
+ # (at your option) any later version.
13
+ #
14
+ # uhferret is distributed in the hope that it will be useful,
15
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
+ # GNU General Public License for more details.
18
+ #
19
+ # You should have received a copy of the GNU General Public License
20
+ # along with uhferret. If not, see <http://www.gnu.org/licenses/>.
21
+
22
+ require "optparse"
23
+ require "webferret"
24
+
25
+ # Name of the ferret software
26
+ FERRET = "uhferret"
27
+ # Base folder for working with files
28
+ $base = "#{Dir.pwd}/FerretFiles/"
29
+ # Port number for server
30
+ port = 2000
31
+
32
+ options = OptionParser.new do |opts|
33
+ opts.banner = "Usage: uhferret-server [options]"
34
+ opts.on("-h", "--help", "help message") do |v|
35
+ puts options
36
+ exit!
37
+ end
38
+ # -- port number
39
+ opts.on("-p", "--port n", Integer, "port number") do |v|
40
+ port = v
41
+ end
42
+ # -- base folder
43
+ opts.on("-f", "--folder FOLDER", "base folder") do |v|
44
+ $base = File.expand_path v
45
+ end
46
+ end
47
+
48
+ begin
49
+ options.parse!
50
+
51
+ # Refuse to work if the folder already exists
52
+ if File.exists? $base
53
+ puts "Folder #{$base} already exists"
54
+ puts "Please empty the folder before starting the ferret server"
55
+ exit!
56
+ end
57
+
58
+ # install the ferret server at given Port number
59
+ ferret = HTTPServer.new(:Port => port, :DocumentRoot => "/")
60
+
61
+ ferret.mount "/ferret/home", UHFerret::FerretHomeServlet
62
+ ferret.mount "/ferret/report", UHFerret::FerretReportServlet
63
+
64
+ trap("INT") { ferret.shutdown }
65
+
66
+ ferret.start
67
+ end
68
+
@@ -0,0 +1,231 @@
1
+ #include "document.h"
2
+
3
+ /**
4
+ * This file is part of uhferret.
5
+ *
6
+ * Author:: Peter Lane
7
+ * Copyright:: Copyright 2011, Peter Lane.
8
+ * License:: GPLv3
9
+ *
10
+ * uhferret is free software: you can redistribute it and/or modify
11
+ * it under the terms of the GNU General Public License as published by
12
+ * the Free Software Foundation, either version 3 of the License, or
13
+ * (at your option) any later version.
14
+ *
15
+ * uhferret is distributed in the hope that it will be useful,
16
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
17
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18
+ * GNU General Public License for more details.
19
+ *
20
+ * You should have received a copy of the GNU General Public License
21
+ * along with uhferret. If not, see <http://www.gnu.org/licenses/>.
22
+ */
23
+
24
+ Document::Document (std::string pathname, DocumentType type, int id)
25
+ : _pathname (pathname),
26
+ _type (type),
27
+ _num_trigrams (0),
28
+ _group_id (id)
29
+ {}
30
+
31
+ Document::Document (Document * document)
32
+ : _pathname (document->_pathname),
33
+ _type (document->_type),
34
+ _num_trigrams (0),
35
+ _group_id (document->_group_id)
36
+ {}
37
+
38
+ void Document::SetType (DocumentType type)
39
+ {
40
+ _type = type;
41
+ }
42
+
43
+ bool Document::IsTextType () const
44
+ {
45
+ return (_type == TypeText);
46
+ }
47
+
48
+ std::string Document::GetPathname () const
49
+ {
50
+ return _pathname;
51
+ }
52
+
53
+ void Document::SetPathname (std::string pathname)
54
+ {
55
+ _pathname = pathname;
56
+ }
57
+
58
+ int Document::GetGroupId () const
59
+ {
60
+ return _group_id;
61
+ }
62
+
63
+ void Document::SetGroupId (int id)
64
+ {
65
+ _group_id = id;
66
+ }
67
+
68
+ int Document::GetTrigramCount () const
69
+ {
70
+ return _num_trigrams;
71
+ }
72
+
73
+ // WARNING: This method should only be used when document definitions are
74
+ // loaded from a file.
75
+ void Document::SetTrigramCount (int count)
76
+ {
77
+ _num_trigrams = count;
78
+ }
79
+
80
+ void Document::ResetTrigramCount ()
81
+ {
82
+ _num_trigrams = 0;
83
+ }
84
+
85
+ void Document::IncrementTrigramCount ()
86
+ {
87
+ _num_trigrams += 1;
88
+ }
89
+
90
+ // Start input from the file referred to by this document
91
+ bool Document::StartInput (TokenSet & tokenset)
92
+ {
93
+ _fb = new std::ifstream(GetPathname().c_str (), std::ifstream::in);
94
+ if (_fb->is_open ())
95
+ {
96
+ _cin = _fb;
97
+ InitialiseInput (tokenset);
98
+ return true; // signify file opened correctly
99
+ }
100
+ else
101
+ {
102
+ return false;
103
+ }
104
+ }
105
+
106
+ // Start input from a provided input stream
107
+ bool Document::StartInput (std::istream & input, TokenSet & tokenset)
108
+ {
109
+ _cin = &input;
110
+ InitialiseInput (tokenset);
111
+ return true;
112
+ }
113
+
114
+ // Start input by constructing a new Reader based on current document type
115
+ // TokenSet is provided by caller, so Reader uses common set of labels for tokens
116
+ void Document::InitialiseInput (TokenSet & tokenset)
117
+ {
118
+ if (_type == TypeText)
119
+ {
120
+ _token_input = new WordReader (* _cin);
121
+ }
122
+ else // (_type == typeCode)
123
+ {
124
+ _token_input = new CCodeReader (* _cin);
125
+ }
126
+ ReadTrigram (tokenset); // read first two tokens so next call to
127
+ ReadTrigram (tokenset); // ReadTrigram returns the first complete trigram
128
+ }
129
+
130
+ // returns true if this document's filetype is the same as the given extension
131
+ // -- note, case is ignored, so "txt" == "TXT" == "tXt"
132
+ bool Document::IsFileType (std::string extension) const
133
+ {
134
+ int dot_posn = _pathname.find_last_of ('.', true); // search for last dot, i.e. from end
135
+ if (dot_posn == _pathname.npos) return false;
136
+ std::string file_extension = _pathname.substr (dot_posn+1, _pathname.npos-(dot_posn+1));
137
+
138
+ return StringToUpper(file_extension) == StringToUpper(extension); // ignore case in comparison
139
+ }
140
+
141
+ std::string Document::StringToUpper (std::string str) const
142
+ {
143
+ std::string nstr = "";
144
+
145
+ for(int i=0, l = str.length (); i < l; i += 1)
146
+ {
147
+ nstr += std::toupper(str[i]);
148
+ }
149
+ return nstr;
150
+ }
151
+
152
+ // Test if file extension represents a c-type language
153
+ bool Document::IsCodeType () const
154
+ {
155
+ return IsFileType ("cpp") ||
156
+ IsFileType ("c") ||
157
+ IsFileType ("java") ||
158
+ IsFileType ("h");
159
+ }
160
+
161
+ // Test if file extension represents a pure text document
162
+ bool Document::IsTxtType () const
163
+ {
164
+ return IsFileType ("txt");
165
+ }
166
+
167
+ // Test if file is not a known type
168
+ bool Document::IsUnknownType () const
169
+ {
170
+ return ! (IsCodeType () || IsTxtType ());
171
+ }
172
+
173
+ // Reads next input token and updates information held on current trigram.
174
+ // return true if a trigram has been read and is ready for retrieval
175
+ bool Document::ReadTrigram (TokenSet & tokenset)
176
+ {
177
+ _current_tuple[0] = _current_tuple[1];
178
+ _current_tuple[1] = _current_tuple[2];
179
+ _current_start[0] = _current_start[1];
180
+ _current_start[1] = _current_start[2];
181
+ if ( _token_input->ReadToken () )
182
+ {
183
+ _current_tuple[2] = _token_input->GetToken (tokenset);
184
+ _current_start[2] = _token_input->GetTokenStart ();
185
+ return true;
186
+ }
187
+ else
188
+ {
189
+ return false;
190
+ }
191
+
192
+ }
193
+
194
+ // retrieve a token of the current tuple, based on position within tuple
195
+ // -- as we only deal with trigrams, index must be in [0,2]
196
+ std::size_t Document::GetToken (int i) const
197
+ {
198
+ assert (i>=0 && i<=2);
199
+ return _current_tuple[i];
200
+ }
201
+
202
+ // retrieve the start position of current trigram
203
+ std::size_t Document::GetTrigramStart () const
204
+ {
205
+ return _current_start[0];
206
+ }
207
+
208
+ // retrieve the start position of token i within current trigram
209
+ // -- used to get start position of second word
210
+ std::size_t Document::GetTrigramStart (int i) const
211
+ {
212
+ assert (i>=0 && i<=2);
213
+ return _current_start[i];
214
+ }
215
+
216
+ // retrieve the end position of the current token
217
+ std::size_t Document::GetTrigramEnd () const
218
+ {
219
+ return _token_input->GetTokenEnd ();
220
+ }
221
+
222
+ // Close up the input file buffer
223
+ void Document::CloseInput ()
224
+ {
225
+ delete _token_input;
226
+ if (_fb->is_open ())
227
+ {
228
+ _fb->close ();
229
+ }
230
+ }
231
+