uhferret 1.3.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/COPYING.txt +674 -0
- data/README.rdoc +79 -0
- data/bin/uhferret +129 -0
- data/bin/uhferret-server +68 -0
- data/ext/document.cpp +231 -0
- data/ext/document.h +89 -0
- data/ext/documentlist.cpp +229 -0
- data/ext/documentlist.h +80 -0
- data/ext/extconf.rb +2 -0
- data/ext/tokenreader.cpp +196 -0
- data/ext/tokenreader.h +85 -0
- data/ext/tokenset.cpp +111 -0
- data/ext/tokenset.h +73 -0
- data/ext/tupleset.cpp +150 -0
- data/ext/tupleset.h +92 -0
- data/ext/uhferret_lib_wrap.cxx +10726 -0
- data/lib/uhferret.rb +441 -0
- data/lib/utils.rb +93 -0
- data/lib/webferret.rb +246 -0
- metadata +71 -0
data/lib/webferret.rb
ADDED
@@ -0,0 +1,246 @@
|
|
1
|
+
#--
|
2
|
+
# This file is part of uhferret.
|
3
|
+
#
|
4
|
+
# Author:: Peter Lane
|
5
|
+
# Copyright:: Copyright 2012, Peter Lane.
|
6
|
+
# License:: GPLv3
|
7
|
+
#
|
8
|
+
# uhferret is free software: you can redistribute it and/or modify
|
9
|
+
# it under the terms of the GNU General Public License as published by
|
10
|
+
# the Free Software Foundation, either version 3 of the License, or
|
11
|
+
# (at your option) any later version.
|
12
|
+
#
|
13
|
+
# uhferret is distributed in the hope that it will be useful,
|
14
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
15
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
16
|
+
# GNU General Public License for more details.
|
17
|
+
#
|
18
|
+
# You should have received a copy of the GNU General Public License
|
19
|
+
# along with uhferret. If not, see <http://www.gnu.org/licenses/>.
|
20
|
+
|
21
|
+
require 'find'
require 'shellwords'
require 'uhferret'
require 'utils'
require 'webrick'
|
25
|
+
include WEBrick
|
26
|
+
|
27
|
+
# Counters shared by the servlets below, which read and bump them as
# @@next_upload / @@next_report.
#
# A bare `@@var = ...` at the top level raises RuntimeError ("class variable
# access from toplevel") on Ruby 3.0 and later.  Defining the variables inside
# an explicit `class Object` body stores them in the same place the old
# top-level assignment did, so the servlet classes still find them through
# their ancestor chain, but without tripping that check.
class Object
  @@next_upload = 0 # keeps track of number of uploads, for naming folders
  @@next_report = 0 # keeps track of number of reports created
end
|
29
|
+
|
30
|
+
module UHFerret
|
31
|
+
|
32
|
+
# Displays a welcome page, providing a field to upload the zipped file.
|
33
|
+
# On pressing 'submit', runs Ferret and passes results to FerretResultsServlet.
|
34
|
+
class FerretHomeServlet < HTTPServlet::AbstractServlet
|
35
|
+
|
36
|
+
# Returns the 'welcome page' html: an upload form followed by usage
# instructions.
#
# The list of accepted archive extensions is built from
# Utils::CompressedFileExtensions.  The entries for word-processed and pdf
# documents are only included when Utils.command_present? reports that the
# "abiword" / "pdftotext" commands are available.
#
# req:: the incoming request (not used here).
# res:: the response; its Content-Type header and body are filled in.
def do_GET(req, res)
  res['Content-Type'] = "text/html"

  archive_forms = Utils::CompressedFileExtensions.map {|ext| "<tt>.#{ext}</tt>"}.join(", ")

  word_processed_item = ""
  if Utils.command_present?("abiword")
    word_processed_item = "<li>word-processed files (such as <tt>doc</tt> or <tt>rtf</tt> files)</li>"
  end

  pdf_item = ""
  if Utils.command_present?("pdftotext")
    pdf_item = "<li><tt>pdf</tt> documents</li>"
  end

  res.body = <<BODY
<html><body><h1>Ferret Server</h1>
<p>
Ferret is a tool for detecting copying in groups of documents,
and was created by the (now defunct) Plagiarism Detection Group,
University of Hertfordshire.
</p>
<form method="POST" enctype="multipart/form-data">
<p>Compressed file: <input type="file" name="data" size="40">
<p><input type="submit"/>
</p>
</form>

<hr><h2>Instructions for use</h2>
<p>
<ol>
<li>Construct a compressed folder of your files in a way suitable for
your own computer. The Ferret Server will handle a compressed
folder in one of the following forms:
#{archive_forms}.
The files within it may be as:
<ul>
<li>plain text files</li>
#{word_processed_item}
#{pdf_item}
</ul>
Files may contain natural language text or computer programs (C-type
languages).
</li>
<li>Use the 'Browse' button to select your compressed file.
</li>
<li>Once Ferret has finished analysing the documents, the display will show
a table of the top 100 results.
</li>
<li>Click on the 'view' link beside each pair to see a report of
the comparisons found in that pair of documents. Use the print option of
your browser to preserve a copy (e.g. using 'print to pdf').
</li>
</ol>
</p>
<hr><font size=-1>Ferret home page generated on: #{Time.now}<br />.
</font>
</body>
</html>
BODY
end
|
95
|
+
|
96
|
+
# Convenience method to check if a string ends with given ending.
#
# str::     the string to inspect.
# str_end:: the suffix to look for.
#
# Returns true when _str_ ends with _str_end_, false otherwise.  Delegates to
# String#end_with?, so an empty _str_end_ matches every string.  The previous
# hand-rolled slice (<tt>str[-0 .. -1]</tt>, i.e. the whole string) wrongly
# answered false for an empty suffix on a non-empty string.
def endsWith?(str, str_end)
  str.end_with?(str_end)
end
|
101
|
+
|
102
|
+
# Checks if given _filename_ is an example of a compressed file, i.e. whether
# it ends with one of the extensions listed in Utils::CompressedFileExtensions.
#
# Returns true on the first matching extension, false when none match.
def isCompressedFile?(filename)
  Utils::CompressedFileExtensions.each do |extension|
    return true if endsWith?(filename, extension)
  end
  false
end
|
106
|
+
|
107
|
+
# If _filename_ names a known compressed file format, it is decompressed
# and deleted.
#
# The extractor is chosen from the file name's ending (rar, tbz2/tar.bz2,
# tgz/tar.gz, zip) and is only run when Utils.command_present? reports that
# the command is available.  The file is deleted afterwards whether or not
# anything was extracted.
#
# _filename_ originates from the name the client gave its upload, so it must
# never reach a shell.  The extractor is therefore started with the
# multi-argument form of Kernel#system, which passes _filename_ as a single
# argument without shell interpretation.  (The previous backtick form
# interpolated it into a shell command line.)  Standard output is discarded,
# as it was when the backticks captured and ignored it.
def decompress_file filename
  extractor =
    if filename.end_with?("rar")
      ["unrar", "x"]
    elsif filename.end_with?("tbz2", "tar.bz2")
      ["tar", "jxf"]
    elsif filename.end_with?("tgz", "tar.gz")
      ["tar", "zxf"]
    elsif filename.end_with?("zip")
      ["unzip"]
    end

  if extractor && Utils.command_present?(extractor.first)
    system(*extractor, filename, out: File::NULL)
  end

  File.delete filename # remove the compressed folder
end
|
121
|
+
|
122
|
+
# Finds all files in the given folder and adds their names to a definitions
# file.
#
# Changes the working directory to _folder_, walks it recursively with
# Find.find (directories themselves are ignored), and writes the name of every
# file accepted by Utils.valid_document? to "ferret-file-definitions.def" in
# that directory, one per line.
#
# Returns true if the files are text documents, or false if
# Utils.is_code? flagged at least one of the files found.
def create_file_definitions folder
  Dir.chdir folder

  found = Find.find(folder).select { |path| File.file?(path) } # ignore directories
  code_flags = found.map { |path| Utils.is_code?(path) }

  # write the names of valid files into a definitions file
  File.open("ferret-file-definitions.def", "w") do |defn_file|
    found.select { |path| Utils.valid_document?(path) }.each do |path|
      defn_file.puts path
    end
  end

  code_flags.none?
end
|
141
|
+
|
142
|
+
# this method is triggered when the user clicks on 'submit query'
#
# Stores the uploaded archive in a fresh "Upload<n>" folder below $base,
# unpacks it, runs Ferret over the extracted files and answers with a
# meta-refresh pointing at the generated results.html.  When the upload is
# missing or its name does not look like a supported compressed file, an
# error page is returned and nothing is written to disk.
#
# req:: the request; the uploaded file is read from its "data" form field.
# res:: the response; its Content-Type header and body are filled in.
#
# Changes from the earlier version:
# * File.exists? was removed in Ruby 3.2; File.exist? is used instead.
# * the client-supplied file name is reduced to its final path component, so
#   a name such as "../../x.zip" can no longer escape the upload folder.
# * a request without a "data" field no longer fails with NoMethodError.
# * the name is validated before any folder or file is created.
def do_POST(req, res)
  res['Content-Type'] = "text/html"
  upload_data = req.query["data"]

  # The name is chosen by the client: strip any directory part (either
  # separator style) before using it to build a path on the server.
  client_name = (upload_data && upload_data.filename).to_s
  filename = File.basename(client_name.tr("\\", "/")).gsub(' ', "-") # replace spaces

  unless isCompressedFile?(filename)
    res.body = "<html><body><h1>Error</h1>\n" \
               "<p>You did not submit a valid zip file.</p>\n" \
               "<p><a href=\"/ferret/home\">Return to Ferret home page</a>.</p>\n" \
               "</body></html>"
    return
  end

  upload_dir = "Upload#{@@next_upload}" # create a unique folder for user's files
  @@next_upload += 1

  # NOTE(review): the Dir.chdir below changes the working directory of the
  # whole server process, so $base presumably has to be an absolute path for
  # later requests to resolve it correctly -- confirm where $base is set.
  Dir.mkdir $base unless File.exist? $base
  upload_path = "#{$base}/#{upload_dir}"
  Dir.mkdir upload_path

  File.open("#{upload_path}/#{filename}", "wb") do |file| # do the actual upload of the data
    upload_data.each_data do |data|
      file << data.to_s
    end
  end

  # decompress inside the upload folder and compute similarities
  Dir.chdir upload_path
  decompress_file filename
  is_text = create_file_definitions Dir.pwd

  # do the computation of similarities
  # -- output to html table with given folder name, using file definition list
  `#{FERRET} #{is_text ? "-t" : "-c"} -w -f ferret-file-definitions.def > results.html`
  res.body = "<meta HTTP-EQUIV=\"REFRESH\" content=\"0; url=#{$base}/#{upload_dir}/results.html\">"
end
|
177
|
+
end
|
178
|
+
|
179
|
+
# This servlet is triggered by a click on 'Download' link in report table
|
180
|
+
# It creates the xml report comparing two documents
|
181
|
+
class FerretReportServlet < HTTPServlet::AbstractServlet
|
182
|
+
|
183
|
+
# Handles the request to create a report in xml format.
#
# Reads the 'upload', 'file1' and 'file2' query parameters, changes into the
# upload folder, runs Ferret with -x to write "report<n>.xml" there, writes
# the accompanying style sheet next to it, and answers with a meta-refresh
# pointing at the report.
#
# req:: the request carrying the three query parameters.
# res:: the response; its Content-Type header and body are filled in.
#
# The three parameters come straight from the request, so each one is escaped
# with Shellwords.escape before being placed on the Ferret command line.
# Wrapping them in double quotes, as before, still let `"`, `$(...)` and
# backticks through to the shell.
#
# NOTE(review): 'upload' is also passed to Dir.chdir and used as the output
# directory without checking that it names one of this server's upload
# folders -- consider validating it against $base.
def do_GET(req, res)
  upload_dir = req.query['upload']
  file1 = req.query['file1'].gsub("%20", " ")
  file2 = req.query['file2'].gsub("%20", " ")
  report_name = "#{upload_dir}/report#{@@next_report}.xml"
  @@next_report += 1
  Dir.chdir upload_dir.to_s

  mode = Utils.is_code?(file1) ? "-c" : "-t"
  report_arg, file1_arg, file2_arg = [report_name, file1, file2].map { |arg| Shellwords.escape(arg) }
  `#{FERRET} #{mode} -x #{report_arg} #{file1_arg} #{file2_arg}`

  write_style_sheet File.dirname(report_name)
  res['Content-Type'] = "text/html"
  res.body = "<meta HTTP-EQUIV=\"REFRESH\" content=\"0; url=#{report_name}\">"
end
|
197
|
+
|
198
|
+
private
|
199
|
+
# Writes the XSLT style sheet "uhferret.xsl" into _dir_, overwriting any
# existing file of that name.  The style sheet renders a uhferret comparison
# report as an html page: the top-level trigram count and similarity, then
# for each document its statistics and its text, block by block, with blocks
# marked 'copied' shown highlighted.
#
# Blocks marked 'normal' now carry class="normal" so that the .normal rule
# declared in the <style> element applies to them; the earlier
# style="normal" attribute is not a valid CSS declaration and left that rule
# unused.
def write_style_sheet dir
  File.open("#{dir}/uhferret.xsl", "w") do |f|
    f.puts <<STYLESHEET
<?xml version="1.0" encoding="ISO-8859-1"?>

<html xsl:version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns="http://www.w3.org/1999/xhtml">
<head>
<style> <!-- style sheet for document -->
h1 {background-color: #d0d0d0} <!-- add a background to make headings stand out -->
h2 {background-color: #d0d0d0}
.highlight {font-weight:bold; color:blue} <!-- highlighted text style -->
.normal {font-weight:normal} <!-- normal text style -->
</style>
</head>
<body>
<h1>UH-Ferret: Document comparison</h1>

<!-- display top-level information -->
<p>Common trigrams: <xsl:value-of select="uhferret/common-trigrams"/></p>
<p>Similarity: <xsl:value-of select="uhferret/similarity"/></p>

<!-- work through each document -->
<xsl:for-each select="uhferret/document">
<!-- display document-level information -->
<h2>Document: <xsl:value-of select="source"/></h2>
<p>Number of trigrams: <xsl:value-of select="num-trigrams"/></p>
<p>Containment in other document: <xsl:value-of select="containment"/></p>

<!-- work through each block in text, displaying as highlighted or normal -->
<pre>
<xsl:for-each select="text/block">
<xsl:if test="@text='copied'">
<span class="highlight"><xsl:value-of select="."/></span>
</xsl:if>
<xsl:if test="@text='normal'">
<span class="normal"><xsl:value-of select="."/></span>
</xsl:if>
</xsl:for-each>
</pre>
</xsl:for-each>
</body>
</html>
STYLESHEET
  end
end
|
244
|
+
end
|
245
|
+
|
246
|
+
end
|
metadata
ADDED
@@ -0,0 +1,71 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: uhferret
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.3.7
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Peter Lane
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2020-11-10 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: "UHFerret is a copy-detection tool to analyse large sets of documents
|
14
|
+
\nto find pairs of documents with substantial amounts of lexical copying. \nDocuments
|
15
|
+
containing either natural language (e.g. English) or computer \nprograms (in C-family)
|
16
|
+
may be processed. \n"
|
17
|
+
email: peterlane@gmx.com
|
18
|
+
executables:
|
19
|
+
- uhferret
|
20
|
+
- uhferret-server
|
21
|
+
extensions:
|
22
|
+
- ext/extconf.rb
|
23
|
+
extra_rdoc_files:
|
24
|
+
- README.rdoc
|
25
|
+
- COPYING.txt
|
26
|
+
files:
|
27
|
+
- COPYING.txt
|
28
|
+
- README.rdoc
|
29
|
+
- bin/uhferret
|
30
|
+
- bin/uhferret-server
|
31
|
+
- ext/document.cpp
|
32
|
+
- ext/document.h
|
33
|
+
- ext/documentlist.cpp
|
34
|
+
- ext/documentlist.h
|
35
|
+
- ext/extconf.rb
|
36
|
+
- ext/tokenreader.cpp
|
37
|
+
- ext/tokenreader.h
|
38
|
+
- ext/tokenset.cpp
|
39
|
+
- ext/tokenset.h
|
40
|
+
- ext/tupleset.cpp
|
41
|
+
- ext/tupleset.h
|
42
|
+
- ext/uhferret_lib_wrap.cxx
|
43
|
+
- lib/uhferret.rb
|
44
|
+
- lib/utils.rb
|
45
|
+
- lib/webferret.rb
|
46
|
+
homepage: https://peterlane.netlify.app/ferret/
|
47
|
+
licenses:
|
48
|
+
- GPL-3.0+
|
49
|
+
metadata: {}
|
50
|
+
post_install_message:
|
51
|
+
rdoc_options:
|
52
|
+
- "-m"
|
53
|
+
- README.rdoc
|
54
|
+
require_paths:
|
55
|
+
- lib
|
56
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
57
|
+
requirements:
|
58
|
+
- - ">="
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
version: '2.5'
|
61
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
62
|
+
requirements:
|
63
|
+
- - ">="
|
64
|
+
- !ruby/object:Gem::Version
|
65
|
+
version: '0'
|
66
|
+
requirements: []
|
67
|
+
rubygems_version: 3.1.4
|
68
|
+
signing_key:
|
69
|
+
specification_version: 4
|
70
|
+
summary: UHFerret is a copy-detection tool.
|
71
|
+
test_files: []
|