uhferret 1.3.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,79 @@
1
+ = UHFerret
2
+
3
+ homepage:: https://peterlane.netlify.org/ferret/
4
+ source:: https://notabug.org/peterlane/uhferret-gem/releases
5
+
6
+ == Description
7
+
8
+ UHFerret is a copy-detection tool, supporting the analysis of large sets of
9
+ documents to find pairs of documents with substantial amounts of lexical
10
+ copying. Documents containing either natural language (e.g. English) or
11
+ computer programs (in C-family languages) may be processed.
12
+
13
+ This library provides a Ruby wrapper around uhferret suitable for
14
+ scripting, a command-line executable, 'uhferret', and a simple
15
+ server version, 'uhferret-server'.
16
+
17
+ NB: to install uhferret, Ruby must be able to compile and build C extensions.
18
+
19
+ == Use
20
+
21
+ === Command Line
22
+
23
+ Usage: uhferret [options] file1 file2 ...
24
+ -h, --help help message
25
+ -c, --code process documents as code
26
+ -t, --text process documents as text (default)
27
+ -d, --data-table output similarity table (default)
28
+ -l, --list-trigrams output trigram list
29
+ -a, --all-comparisons output list of all comparisons
30
+ -x, --xml-report FILE generate xml report from two documents
31
+ -f, --definition-file FILE read document names from file
32
+
33
+ To compute the similarities of a set of files, use:
34
+
35
+ $ uhferret file1.txt file2.txt ...
36
+
37
+ An xml output can be generated for a pair of files using:
38
+
39
+ $ uhferret -x outfile.xml file1.txt file2.txt
40
+
41
+ The xml output can be displayed in a browser using the style sheet
42
+ 'uhferret.xsl' in the examples folder, and then printed from the browser.
43
+
44
+ === Program
45
+
46
+ Ferret can also be used as a library, and called from within a program.
47
+ For example:
48
+
49
+ ferret = Ferret.new
50
+ ferret.add 'filename1.txt'
51
+ ferret.add 'filename2.txt'
52
+ ferret.run
53
+ ferret.output_similarity_table
54
+
55
+ This will create a new instance of Ferret, add two documents, run the comparison,
56
+ and then output the similarity between the two.
57
+
58
+ === Server
59
+
60
+ Usage: uhferret-server [options]
61
+ -h, --help help message
62
+ -p, --port n port number
63
+ -f, --folder FOLDER base folder
64
+
65
+ The folder to store the processed files will default to
66
+ 'FerretFiles' and the port to 2000.
67
+ Initial address: http://localhost:2000/ferret/home
68
+
69
+ NB: The server uses some \*nix commands, and so currently does not work
70
+ under Windows.
71
+
72
+ == Acknowledgements
73
+
74
+ UHFerret has been developed at the University of Hertfordshire by members of
75
+ the Plagiarism Detection Group. The original concept of using trigrams for
76
+ measuring copying was developed by Caroline Lyon and James Malcolm. JunPeng
77
+ Bao, Ruth Barrett and Bob Dickerson also contributed to the development of
78
+ earlier versions of Ferret.
79
+
@@ -0,0 +1,129 @@
1
+ #! /usr/bin/env ruby
2
+
3
+ # This file is part of uhferret, providing a command-line interface.
4
+ #
5
+ # Author:: Peter Lane
6
+ # Copyright:: Copyright 2012-20, Peter Lane.
7
+ # License:: GPLv3
8
+ #
9
+ # uhferret is free software: you can redistribute it and/or modify
10
+ # it under the terms of the GNU General Public License as published by
11
+ # the Free Software Foundation, either version 3 of the License, or
12
+ # (at your option) any later version.
13
+ #
14
+ # uhferret is distributed in the hope that it will be useful,
15
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
+ # GNU General Public License for more details.
18
+ #
19
+ # You should have received a copy of the GNU General Public License
20
+ # along with uhferret. If not, see <http://www.gnu.org/licenses/>.
21
+
22
+ require "optparse"
23
+ require "uhferret"
24
+
25
+ MAX_TABLE_SIZE = 100
26
+ VERSION = "1.3.7"
27
+
28
+ # ------------------------------------------------------------------
29
+ #
30
+ document_type = UHFerret::TextDocument
31
+ input_format = :from_argv
32
+ output_format = :similarity_table
33
+ definitions_file = nil
34
+ output_full_path = false
35
+ xml_output_file = ""
36
+
37
+ options = OptionParser.new do |opts|
38
+ opts.banner = "Usage: uhferret [options] file1 file2 ..."
39
+ opts.on("-h", "--help", "help message") do |v|
40
+ puts options
41
+ exit!
42
+ end
43
+ opts.on("-v", "--version", "version") do |v|
44
+ puts "uhferret: version #{VERSION}"
45
+ exit!
46
+ end
47
+ # -- document type
48
+ opts.on("-c", "--code", "process documents as code") do |v|
49
+ document_type = UHFerret::CodeDocument
50
+ end
51
+ opts.on("-t", "--text", "process documents as text (default)") do |v|
52
+ document_type = UHFerret::TextDocument
53
+ end
54
+ # -- output format
55
+ opts.on("-p", "--full-path", "output full path") do |v|
56
+ output_full_path = true
57
+ end
58
+ opts.on("-d", "--data-table", "output similarity table (default)") do |v|
59
+ output_format = :similarity_table
60
+ end
61
+ opts.on("-w", "--html-data-table", "output similarity table in html format") do |v|
62
+ output_format = :html_similarity_table
63
+ end
64
+ opts.on("-l", "--list-trigrams", "output trigram list") do |v|
65
+ output_format = :trigram_list
66
+ end
67
+ opts.on("-a", "--all-comparisons", "output list of all comparisons") do |v|
68
+ output_format = :all_comparisons
69
+ end
70
+ opts.on("-x", "--xml-report OUTPUT_FILE", "generate xml report from two documents") do |file|
71
+ output_format = :xml_output
72
+ xml_output_file = file
73
+ end
74
+ # -- file source
75
+ opts.on("-f FILE", "--definition-file FILE", "read document names from file") do |file|
76
+ input_format = :from_file
77
+ definitions_file = file
78
+ end
79
+ end
80
+
81
+ begin
82
+ # -- process input options (removes recognised options from ARGV, leaving filenames)
83
+ options.parse!
84
+
85
+ # -- an xml report compares exactly two documents
86
+ if output_format == :xml_output
87
+ unless ARGV.size == 2
88
+ puts "Error: for xml report, only provide two input filenames"
89
+ raise ArgumentError.new
90
+ end
91
+ end
92
+
93
+ # -- add readable files, and run; unreadable names are skipped without a message
94
+ ferret = UHFerret::Ferret.new
95
+ unless definitions_file.nil?
96
+ if File.readable? definitions_file
97
+ ferret.add_list_from_file(definitions_file, document_type)
98
+ end
99
+ end
100
+ ARGV.each do |filename|
101
+ if File.readable? filename
102
+ ferret.add(filename, document_type)
103
+ end
104
+ end
105
+ if ferret.size < 2
106
+ puts "Error: not enough valid filenames"
107
+ raise ArgumentError.new
108
+ end
109
+ ferret.run
110
+
111
+ # -- display output in the format selected by the options
112
+ case output_format
113
+ when :similarity_table
114
+ ferret.output_similarity_table output_full_path
115
+ when :html_similarity_table
116
+ ferret.output_html_similarity_table
117
+ when :trigram_list
118
+ ferret.output_trigram_list
119
+ when :all_comparisons
120
+ ferret.output_all_comparisons
121
+ when :xml_output
122
+ ferret.xml_output(xml_output_file, 0, 1)
123
+ end
124
+ # -- report ordinary errors with the usage text; Interrupt and the like are left to propagate
125
+ rescue StandardError => err
126
+ puts err
127
+ puts options
128
+ end
129
+
@@ -0,0 +1,68 @@
1
+ #! /usr/bin/env ruby
2
+ #
3
+ # This file is part of uhferret, providing a web service interface.
4
+ #
5
+ # Author:: Peter Lane
6
+ # Copyright:: Copyright 2012, Peter Lane.
7
+ # License:: GPLv3
8
+ #
9
+ # uhferret is free software: you can redistribute it and/or modify
10
+ # it under the terms of the GNU General Public License as published by
11
+ # the Free Software Foundation, either version 3 of the License, or
12
+ # (at your option) any later version.
13
+ #
14
+ # uhferret is distributed in the hope that it will be useful,
15
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
+ # GNU General Public License for more details.
18
+ #
19
+ # You should have received a copy of the GNU General Public License
20
+ # along with uhferret. If not, see <http://www.gnu.org/licenses/>.
21
+
22
+ require "optparse"
23
+ require "webferret"
24
+
25
+ # Name of the ferret software
26
+ FERRET = "uhferret"
27
+ # Base folder for working with files
28
+ $base = "#{Dir.pwd}/FerretFiles/"
29
+ # Port number for server
30
+ port = 2000
31
+
32
+ options = OptionParser.new do |opts|
33
+ opts.banner = "Usage: uhferret-server [options]"
34
+ opts.on("-h", "--help", "help message") do |v|
35
+ puts options
36
+ exit!
37
+ end
38
+ # -- port number
39
+ opts.on("-p", "--port n", Integer, "port number") do |v|
40
+ port = v
41
+ end
42
+ # -- base folder
43
+ opts.on("-f", "--folder FOLDER", "base folder") do |v|
44
+ $base = File.expand_path v
45
+ end
46
+ end
47
+
48
+ begin
49
+ options.parse!
50
+
51
+ # Refuse to work if the folder already exists
52
+ if File.exists? $base
53
+ puts "Folder #{$base} already exists"
54
+ puts "Please empty the folder before starting the ferret server"
55
+ exit!
56
+ end
57
+
58
+ # install the ferret server at given Port number
59
+ ferret = HTTPServer.new(:Port => port, :DocumentRoot => "/")
60
+
61
+ ferret.mount "/ferret/home", UHFerret::FerretHomeServlet
62
+ ferret.mount "/ferret/report", UHFerret::FerretReportServlet
63
+
64
+ trap("INT") { ferret.shutdown }
65
+
66
+ ferret.start
67
+ end
68
+
@@ -0,0 +1,231 @@
1
+ #include "document.h"
2
+
3
+ /**
4
+ * This file is part of uhferret.
5
+ *
6
+ * Author:: Peter Lane
7
+ * Copyright:: Copyright 2011, Peter Lane.
8
+ * License:: GPLv3
9
+ *
10
+ * uhferret is free software: you can redistribute it and/or modify
11
+ * it under the terms of the GNU General Public License as published by
12
+ * the Free Software Foundation, either version 3 of the License, or
13
+ * (at your option) any later version.
14
+ *
15
+ * uhferret is distributed in the hope that it will be useful,
16
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
17
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18
+ * GNU General Public License for more details.
19
+ *
20
+ * You should have received a copy of the GNU General Public License
21
+ * along with uhferret. If not, see <http://www.gnu.org/licenses/>.
22
+ */
23
+
24
+ Document::Document (std::string pathname, DocumentType type, int id)
25
+ : _pathname (pathname),
26
+ _type (type),
27
+ _num_trigrams (0), // a new document starts with no trigrams counted
28
+ _group_id (id)
29
+ {} // NOTE(review): _fb, _cin and _token_input are not set here; StartInput/InitialiseInput assign them
30
+
31
+ Document::Document (Document * document) // takes pathname, type and group id from an existing document
32
+ : _pathname (document->_pathname),
33
+ _type (document->_type),
34
+ _num_trigrams (0), // the trigram count is reset to zero, not copied
35
+ _group_id (document->_group_id)
36
+ {}
37
+
38
+ void Document::SetType (DocumentType type)
39
+ {
40
+ _type = type;
41
+ }
42
+
43
+ bool Document::IsTextType () const
44
+ {
45
+ return (_type == TypeText);
46
+ }
47
+
48
+ std::string Document::GetPathname () const
49
+ {
50
+ return _pathname;
51
+ }
52
+
53
+ void Document::SetPathname (std::string pathname)
54
+ {
55
+ _pathname = pathname;
56
+ }
57
+
58
+ int Document::GetGroupId () const
59
+ {
60
+ return _group_id;
61
+ }
62
+
63
+ void Document::SetGroupId (int id)
64
+ {
65
+ _group_id = id;
66
+ }
67
+
68
+ int Document::GetTrigramCount () const
69
+ {
70
+ return _num_trigrams;
71
+ }
72
+
73
+ // WARNING: This method should only be used when document definitions are
74
+ // loaded from a file.
75
+ void Document::SetTrigramCount (int count)
76
+ {
77
+ _num_trigrams = count;
78
+ }
79
+
80
+ void Document::ResetTrigramCount ()
81
+ {
82
+ _num_trigrams = 0;
83
+ }
84
+
85
+ void Document::IncrementTrigramCount ()
86
+ {
87
+ _num_trigrams += 1;
88
+ }
89
+
90
+ // Start input from the file referred to by this document; returns false if it cannot be opened
91
+ bool Document::StartInput (TokenSet & tokenset)
92
+ {
93
+ _fb = new std::ifstream(GetPathname().c_str (), std::ifstream::in); // heap-allocated on every call
94
+ if (_fb->is_open ())
95
+ {
96
+ _cin = _fb; // the token reader created by InitialiseInput reads through _cin
97
+ InitialiseInput (tokenset);
98
+ return true; // signify file opened correctly
99
+ }
100
+ else
101
+ {
102
+ return false; // NOTE(review): no delete of _fb is visible in this file, on this path or in CloseInput - confirm ownership
103
+ }
104
+ }
105
+
106
+ // Start input from a provided input stream; always returns true
107
+ bool Document::StartInput (std::istream & input, TokenSet & tokenset)
108
+ {
109
+ _cin = &input; // only the address is stored, so the caller's stream must stay alive while reading
110
+ InitialiseInput (tokenset);
111
+ return true; // NOTE(review): _fb is not assigned here, yet CloseInput dereferences it - confirm callers
112
+ }
113
+
114
+ // Start input by constructing a new Reader based on current document type
115
+ // TokenSet is provided by caller, so Reader uses common set of labels for tokens
116
+ void Document::InitialiseInput (TokenSet & tokenset)
117
+ {
118
+ if (_type == TypeText)
119
+ {
120
+ _token_input = new WordReader (* _cin); // deleted again in CloseInput
121
+ }
122
+ else // every non-text type is read with the C-code reader
123
+ {
124
+ _token_input = new CCodeReader (* _cin); // deleted again in CloseInput
125
+ }
126
+ ReadTrigram (tokenset); // read first two tokens so next call to
127
+ ReadTrigram (tokenset); // ReadTrigram returns the first complete trigram
128
+ }
129
+
130
+ // Returns true if the text after the last '.' in this document's pathname equals
131
+ // the given extension -- note, case is ignored, so "txt" == "TXT" == "tXt"
132
+ bool Document::IsFileType (std::string extension) const
133
+ {
134
+ // rfind with no position argument searches the whole pathname for the last dot
135
+ std::string::size_type dot_posn = _pathname.rfind ('.');
136
+ if (dot_posn == std::string::npos) return false; // no dot, so no extension to compare
137
+ std::string file_extension = _pathname.substr (dot_posn+1); // everything after the last dot
138
+ return StringToUpper(file_extension) == StringToUpper(extension); // ignore case in comparison
139
+ }
140
+
141
+ std::string Document::StringToUpper (std::string str) const
142
+ {
143
+ std::string nstr = ""; // upper-cased copy of str, built one character at a time
144
+ // std::toupper needs a value representable as unsigned char, so cast before the call
145
+ for(std::string::size_type i=0, l = str.length (); i < l; i += 1)
146
+ {
147
+ nstr += static_cast<char> (std::toupper (static_cast<unsigned char> (str[i])));
148
+ }
149
+ return nstr;
150
+ }
151
+
152
+ // Test if file extension represents a c-type language
153
+ bool Document::IsCodeType () const
154
+ {
155
+ return IsFileType ("cpp") ||
156
+ IsFileType ("c") ||
157
+ IsFileType ("java") ||
158
+ IsFileType ("h");
159
+ }
160
+
161
+ // Test if file extension represents a pure text document
162
+ bool Document::IsTxtType () const
163
+ {
164
+ return IsFileType ("txt");
165
+ }
166
+
167
+ // Test if file is not a known type
168
+ bool Document::IsUnknownType () const
169
+ {
170
+ return ! (IsCodeType () || IsTxtType ());
171
+ }
172
+
173
+ // Moves the trigram window on by one token: the two newest tokens and their start
174
+ // positions shift down a slot, then the next token (if any) fills the last slot.
175
+ bool Document::ReadTrigram (TokenSet & tokenset)
176
+ {
177
+ // shift slots 1 and 2 down into slots 0 and 1
178
+ for (int slot = 0; slot < 2; slot += 1)
179
+ {
180
+ _current_tuple[slot] = _current_tuple[slot+1];
181
+ _current_start[slot] = _current_start[slot+1];
182
+ }
183
+
184
+ // guard: no further token could be read, so report failure to the caller
185
+ if ( ! _token_input->ReadToken () ) return false;
186
+
187
+ // a new token was read: record it and its start position in the last slot
188
+ _current_tuple[2] = _token_input->GetToken (tokenset);
189
+ _current_start[2] = _token_input->GetTokenStart ();
190
+
191
+ return true;
192
+ }
193
+
194
+ // retrieve a token of the current tuple, based on position within tuple
195
+ // -- as we only deal with trigrams, index must be in [0,2]
196
+ std::size_t Document::GetToken (int i) const
197
+ {
198
+ assert (i>=0 && i<=2);
199
+ return _current_tuple[i];
200
+ }
201
+
202
+ // retrieve the start position of current trigram
203
+ std::size_t Document::GetTrigramStart () const
204
+ {
205
+ return _current_start[0];
206
+ }
207
+
208
+ // retrieve the start position of token i within current trigram
209
+ // -- used to get start position of second word
210
+ std::size_t Document::GetTrigramStart (int i) const
211
+ {
212
+ assert (i>=0 && i<=2);
213
+ return _current_start[i];
214
+ }
215
+
216
+ // retrieve the end position of the current token
217
+ std::size_t Document::GetTrigramEnd () const
218
+ {
219
+ return _token_input->GetTokenEnd ();
220
+ }
221
+
222
+ // Close up the input file buffer and release the token reader created by InitialiseInput
223
+ void Document::CloseInput ()
224
+ {
225
+ delete _token_input;
226
+ if (_fb->is_open ()) // NOTE(review): _fb is only assigned by the file-based StartInput - confirm it is valid here
227
+ {
228
+ _fb->close (); // the stream is closed but not deleted in this file - TODO confirm where it is freed
229
+ }
230
+ }
231
+