uhferret 1.3.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/COPYING.txt +674 -0
- data/README.rdoc +79 -0
- data/bin/uhferret +129 -0
- data/bin/uhferret-server +68 -0
- data/ext/document.cpp +231 -0
- data/ext/document.h +89 -0
- data/ext/documentlist.cpp +229 -0
- data/ext/documentlist.h +80 -0
- data/ext/extconf.rb +2 -0
- data/ext/tokenreader.cpp +196 -0
- data/ext/tokenreader.h +85 -0
- data/ext/tokenset.cpp +111 -0
- data/ext/tokenset.h +73 -0
- data/ext/tupleset.cpp +150 -0
- data/ext/tupleset.h +92 -0
- data/ext/uhferret_lib_wrap.cxx +10726 -0
- data/lib/uhferret.rb +441 -0
- data/lib/utils.rb +93 -0
- data/lib/webferret.rb +246 -0
- metadata +71 -0
data/lib/webferret.rb
ADDED
@@ -0,0 +1,246 @@
|
|
1
|
+
#--
|
2
|
+
# This file is part of uhferret.
|
3
|
+
#
|
4
|
+
# Author:: Peter Lane
|
5
|
+
# Copyright:: Copyright 2012, Peter Lane.
|
6
|
+
# License:: GPLv3
|
7
|
+
#
|
8
|
+
# uhferret is free software: you can redistribute it and/or modify
|
9
|
+
# it under the terms of the GNU General Public License as published by
|
10
|
+
# the Free Software Foundation, either version 3 of the License, or
|
11
|
+
# (at your option) any later version.
|
12
|
+
#
|
13
|
+
# uhferret is distributed in the hope that it will be useful,
|
14
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
15
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
16
|
+
# GNU General Public License for more details.
|
17
|
+
#
|
18
|
+
# You should have received a copy of the GNU General Public License
|
19
|
+
# along with uhferret. If not, see <http://www.gnu.org/licenses/>.
|
20
|
+
|
21
|
+
require 'find'
require 'shellwords'
require 'uhferret'
require 'utils'
require 'webrick'
include WEBrick
|
26
|
+
|
27
|
+
# Counters shared by the servlets: number of uploads (used for naming folders)
# and number of reports created.
# They are defined on Object through class_variable_set because, from Ruby 3.0,
# touching a class variable directly at the toplevel raises RuntimeError.
# Classes still reach them as @@next_upload / @@next_report, since class
# variables are looked up through a class's ancestors.
Object.class_variable_set(:@@next_upload, 0)
Object.class_variable_set(:@@next_report, 0)
|
29
|
+
|
30
|
+
module UHFerret
|
31
|
+
|
32
|
+
# Displays a welcome page, providing a field to upload the zipped file.
|
33
|
+
# On pressing 'submit', runs Ferret and passes results to FerretResultsServlet.
|
34
|
+
class FerretHomeServlet < HTTPServlet::AbstractServlet
|
35
|
+
|
36
|
+
# Serves the 'welcome page': an upload form followed by usage instructions.
# The list of accepted archive extensions comes from
# Utils::CompressedFileExtensions; the word-processor and pdf entries are
# only listed when Utils.command_present? reports the matching converter.
def do_GET(req, res)
  res['Content-Type'] = "text/html"

  extension_tags = Utils::CompressedFileExtensions.map { |ext| "<tt>.#{ext}</tt>" }.join(", ")
  word_item = Utils.command_present?("abiword") ? "<li>word-processed files (such as <tt>doc</tt> or <tt>rtf</tt> files)</li>" : ""
  pdf_item = Utils.command_present?("pdftotext") ? "<li><tt>pdf</tt> documents</li>" : ""

  res.body = <<~BODY
    <html><body><h1>Ferret Server</h1>
    <p>
    Ferret is a tool for detecting copying in groups of documents,
    and was created by the (now defunct) Plagiarism Detection Group,
    University of Hertfordshire.
    </p>
    <form method="POST" enctype="multipart/form-data">
    <p>Compressed file: <input type="file" name="data" size="40">
    <p><input type="submit"/>
    </p>
    </form>

    <hr><h2>Instructions for use</h2>
    <p>
    <ol>
    <li>Construct a compressed folder of your files in a way suitable for
    your own computer. The Ferret Server will handle a compressed
    folder in one of the following forms:
    #{extension_tags}.
    The files within it may be as:
    <ul>
    <li>plain text files</li>
    #{word_item}
    #{pdf_item}
    </ul>
    Files may contain natural language text or computer programs (C-type
    languages).
    </li>
    <li>Use the 'Browse' button to select your compressed file.
    </li>
    <li>Once Ferret has finished analysing the documents, the display will show
    a table of the top 100 results.
    </li>
    <li>Click on the 'view' link beside each pair to see a report of
    the comparisons found in that pair of documents. Use the print option of
    your browser to preserve a copy (e.g. using 'print to pdf').
    </li>
    </ol>
    </p>
    <hr><font size=-1>Ferret home page generated on: #{Time.now}<br />.
    </font>
    </body>
    </html>
  BODY
end
|
95
|
+
|
96
|
+
# Convenience method to check if a string ends with given ending.
# Returns true when _str_ finishes with _str_end_, false otherwise.
# An empty _str_end_ matches every string (the previous slice-based check
# wrongly returned false for an empty ending on a non-empty string).
def endsWith?(str, str_end)
  str.end_with?(str_end)
end
|
101
|
+
|
102
|
+
# Reports whether _filename_ ends with one of the extensions listed in
# Utils::CompressedFileExtensions.
def isCompressedFile?(filename)
  Utils::CompressedFileExtensions.any? do |extension|
    endsWith?(filename, extension)
  end
end
|
106
|
+
|
107
|
+
# If _filename_ names a known compressed file format, it is decompressed
# (by running the matching tool in the current working directory), and the
# file is then deleted. The file is deleted even when no tool was run.
#
# The tool is started with an argument vector instead of a shell command
# string, so shell metacharacters in _filename_ (which originates from the
# uploading client) are never interpreted by a shell.
def decompress_file filename
  command =
    if endsWith?(filename, "rar")
      ["unrar", "x", filename]
    elsif endsWith?(filename, "tbz2") || endsWith?(filename, "tar.bz2")
      ["tar", "jxf", filename]
    elsif endsWith?(filename, "tgz") || endsWith?(filename, "tar.gz")
      ["tar", "zxf", filename]
    elsif endsWith?(filename, "zip")
      ["unzip", filename]
    end

  if command && Utils.command_present?(command.first)
    # the tool's standard output is discarded, as it was with backticks
    system(*command, out: File::NULL)
  end
  File.delete filename # remove the compressed folder
end
|
121
|
+
|
122
|
+
# Collects every regular file beneath _folder_ and writes the names accepted
# by Utils.valid_document? into "ferret-file-definitions.def", one per line.
# The working directory is switched to _folder_ first, so the definitions
# file is created there.
# -- returns false if Utils.is_code? holds for any file found, true otherwise
def create_file_definitions(folder)
  Dir.chdir(folder)
  # keep plain files only; directories are ignored
  documents = Find.find(folder).select { |path| File.file?(path) }
  code_flags = documents.map { |path| Utils.is_code?(path) }
  File.open("ferret-file-definitions.def", "w") do |out|
    documents.each { |path| out.puts(path) if Utils.valid_document?(path) }
  end
  code_flags.none?
end
|
141
|
+
|
142
|
+
# this method is triggered when the user clicks on 'submit query'
#
# Stores the uploaded file in a fresh "Upload<n>" folder beneath $base. When
# the stored name looks like a compressed file it is unpacked, FERRET is run
# over the contents, and the response redirects to the generated
# results.html; otherwise an error page is returned.
def do_POST(req, res)
  upload_dir = "Upload#{@@next_upload}" # create a unique folder for user's files
  @@next_upload += 1

  # File.exist? is used because File.exists? was removed in Ruby 3.2
  Dir.mkdir $base unless File.exist? $base
  Dir.mkdir "#{$base}/#{upload_dir}"
  upload_data = req.query["data"]
  # keep only the last path component of the client-supplied name, so it
  # cannot point outside the upload folder; then replace spaces
  filename = File.basename(upload_data.filename).gsub(' ', "-")
  uploaded_file = "#{$base}/#{upload_dir}/#{filename}"
  File.open(uploaded_file, "wb") do |file| # do the actual upload of the data
    upload_data.each_data do |data|
      file << data.to_s
    end
  end

  res['Content-Type'] = "text/html"
  # if uploaded file is a compressed file, then decompress and compute similarities
  if isCompressedFile?(uploaded_file)
    Dir.chdir "#{$base}/#{upload_dir}"
    decompress_file File.basename(uploaded_file)
    is_text = create_file_definitions Dir.pwd

    # do the computation of similarities
    # -- output to html table with given folder name, using file definition list
    `#{FERRET} #{is_text ? "-t" : "-c"} -w -f ferret-file-definitions.def > results.html`
    res.body = "<meta HTTP-EQUIV=\"REFRESH\" content=\"0; url=#{$base}/#{upload_dir}/results.html\">"
  else
    res.body = %{<html><body><h1>Error</h1>
<p>You did not submit a valid zip file.</p>
<p><a href="/ferret/home">Return to Ferret home page</a>.</p>
</body></html>}
  end
end
|
177
|
+
end
|
178
|
+
|
179
|
+
# This servlet is triggered by a click on 'Download' link in report table
|
180
|
+
# It creates the xml report comparing two documents
|
181
|
+
class FerretReportServlet < HTTPServlet::AbstractServlet
|
182
|
+
|
183
|
+
# Handles the request to create a report in xml format.
#
# Reads the 'upload', 'file1' and 'file2' query parameters, runs FERRET in
# the upload folder to write "report<n>.xml" comparing the two files, writes
# the stylesheet next to the report, and redirects the browser to the report.
def do_GET(req, res)
  upload_dir = req.query['upload']
  file1 = req.query['file1'].gsub("%20", " ")
  file2 = req.query['file2'].gsub("%20", " ")
  report_name = "#{upload_dir}/report#{@@next_report}.xml"
  @@next_report += 1
  # NOTE(review): upload_dir comes straight from the query string and is not
  # checked against the server's upload area here -- confirm this is intended.
  Dir.chdir upload_dir.to_s

  mode = Utils.is_code?(file1) ? "-c" : "-t"
  # every request-derived argument is shell-escaped, so quotes, backticks or
  # $(...) in the query parameters cannot inject shell commands
  `#{FERRET} #{mode} -x #{Shellwords.escape(report_name)} #{Shellwords.escape(file1)} #{Shellwords.escape(file2)}`
  write_style_sheet File.dirname(report_name)
  res['Content-Type'] = "text/html"
  res.body = "<meta HTTP-EQUIV=\"REFRESH\" content=\"0; url=#{report_name}\">"
end
|
197
|
+
|
198
|
+
private
|
199
|
+
# Writes the XSL stylesheet "uhferret.xsl" into _dir_, overwriting any
# existing copy. The stylesheet renders a uhferret xml report as html,
# showing blocks marked 'copied' with the .highlight class and blocks
# marked 'normal' with the .normal class.
def write_style_sheet dir
  File.open("#{dir}/uhferret.xsl", "w") do |f|
    f.puts <<~STYLESHEET
      <?xml version="1.0" encoding="ISO-8859-1"?>

      <html xsl:version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns="http://www.w3.org/1999/xhtml">
      <head>
      <style> <!-- style sheet for document -->
      h1 {background-color: #d0d0d0} <!-- add a background to make headings stand out -->
      h2 {background-color: #d0d0d0}
      .highlight {font-weight:bold; color:blue} <!-- highlighted text style -->
      .normal {font-weight:normal} <!-- normal text style -->
      </style>
      </head>
      <body>
      <h1>UH-Ferret: Document comparison</h1>

      <!-- display top-level information -->
      <p>Common trigrams: <xsl:value-of select="uhferret/common-trigrams"/></p>
      <p>Similarity: <xsl:value-of select="uhferret/similarity"/></p>

      <!-- work through each document -->
      <xsl:for-each select="uhferret/document">
      <!-- display document-level information -->
      <h2>Document: <xsl:value-of select="source"/></h2>
      <p>Number of trigrams: <xsl:value-of select="num-trigrams"/></p>
      <p>Containment in other document: <xsl:value-of select="containment"/></p>

      <!-- work through each block in text, displaying as highlighted or normal -->
      <pre>
      <xsl:for-each select="text/block">
      <xsl:if test="@text='copied'">
      <span class="highlight"><xsl:value-of select="."/></span>
      </xsl:if>
      <xsl:if test="@text='normal'">
      <span class="normal"><xsl:value-of select="."/></span>
      </xsl:if>
      </xsl:for-each>
      </pre>
      </xsl:for-each>
      </body>
      </html>
    STYLESHEET
  end
end
|
244
|
+
end
|
245
|
+
|
246
|
+
end
|
metadata
ADDED
@@ -0,0 +1,71 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: uhferret
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.3.7
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Peter Lane
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2020-11-10 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: "UHFerret is a copy-detection tool to analyse large sets of documents
|
14
|
+
\nto find pairs of documents with substantial amounts of lexical copying. \nDocuments
|
15
|
+
containing either natural language (e.g. English) or computer \nprograms (in C-family)
|
16
|
+
may be processed. \n"
|
17
|
+
email: peterlane@gmx.com
|
18
|
+
executables:
|
19
|
+
- uhferret
|
20
|
+
- uhferret-server
|
21
|
+
extensions:
|
22
|
+
- ext/extconf.rb
|
23
|
+
extra_rdoc_files:
|
24
|
+
- README.rdoc
|
25
|
+
- COPYING.txt
|
26
|
+
files:
|
27
|
+
- COPYING.txt
|
28
|
+
- README.rdoc
|
29
|
+
- bin/uhferret
|
30
|
+
- bin/uhferret-server
|
31
|
+
- ext/document.cpp
|
32
|
+
- ext/document.h
|
33
|
+
- ext/documentlist.cpp
|
34
|
+
- ext/documentlist.h
|
35
|
+
- ext/extconf.rb
|
36
|
+
- ext/tokenreader.cpp
|
37
|
+
- ext/tokenreader.h
|
38
|
+
- ext/tokenset.cpp
|
39
|
+
- ext/tokenset.h
|
40
|
+
- ext/tupleset.cpp
|
41
|
+
- ext/tupleset.h
|
42
|
+
- ext/uhferret_lib_wrap.cxx
|
43
|
+
- lib/uhferret.rb
|
44
|
+
- lib/utils.rb
|
45
|
+
- lib/webferret.rb
|
46
|
+
homepage: https://peterlane.netlify.app/ferret/
|
47
|
+
licenses:
|
48
|
+
- GPL-3.0+
|
49
|
+
metadata: {}
|
50
|
+
post_install_message:
|
51
|
+
rdoc_options:
|
52
|
+
- "-m"
|
53
|
+
- README.rdoc
|
54
|
+
require_paths:
|
55
|
+
- lib
|
56
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
57
|
+
requirements:
|
58
|
+
- - ">="
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
version: '2.5'
|
61
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
62
|
+
requirements:
|
63
|
+
- - ">="
|
64
|
+
- !ruby/object:Gem::Version
|
65
|
+
version: '0'
|
66
|
+
requirements: []
|
67
|
+
rubygems_version: 3.1.4
|
68
|
+
signing_key:
|
69
|
+
specification_version: 4
|
70
|
+
summary: UHFerret is a copy-detection tool.
|
71
|
+
test_files: []
|