uhferret 1.3.7

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,246 @@
1
+ #--
2
+ # This file is part of uhferret.
3
+ #
4
+ # Author:: Peter Lane
5
+ # Copyright:: Copyright 2012, Peter Lane.
6
+ # License:: GPLv3
7
+ #
8
+ # uhferret is free software: you can redistribute it and/or modify
9
+ # it under the terms of the GNU General Public License as published by
10
+ # the Free Software Foundation, either version 3 of the License, or
11
+ # (at your option) any later version.
12
+ #
13
+ # uhferret is distributed in the hope that it will be useful,
14
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
+ # GNU General Public License for more details.
17
+ #
18
+ # You should have received a copy of the GNU General Public License
19
+ # along with uhferret. If not, see <http://www.gnu.org/licenses/>.
20
+
21
require 'find'
require 'shellwords'
require 'uhferret'
require 'utils'
require 'webrick'
include WEBrick
26
+
27
# Counters shared by the servlets below, which read and increment them as
# @@next_upload / @@next_report.  They are class variables of Object (not
# globals), so every class sees them through its ancestor chain.
#
# They are defined with class_variable_set because writing `@@name = value`
# directly at the top level raises RuntimeError ("class variable access from
# toplevel") on Ruby 3.0 and later.
Object.class_variable_set(:@@next_upload, 0) # number of uploads so far, used to name upload folders
Object.class_variable_set(:@@next_report, 0) # number of reports created so far
29
+
30
+ module UHFerret
31
+
32
# Serves the Ferret welcome page and handles uploads of compressed folders.
#
# GET returns the upload form together with usage instructions.  POST stores
# the uploaded archive in a fresh "Upload<n>" folder under $base, unpacks it,
# runs Ferret over the unpacked files and redirects the browser to the
# generated results.html.
#
# Relies on names defined outside this class: $base, FERRET, Utils and the
# top-level counter @@next_upload.
class FerretHomeServlet < HTTPServlet::AbstractServlet

  # File, written into the upload folder, that lists the documents to compare.
  DEFINITIONS_FILE = "ferret-file-definitions.def"

  # Returns the 'welcome page' html.
  def do_GET(req, res)
    extensions = Utils::CompressedFileExtensions.map { |ext| "<tt>.#{ext}</tt>" }.join(", ")
    # Only advertise document formats whose converter program is present.
    word_item = optional_item("abiword",
                              "<li>word-processed files (such as <tt>doc</tt> or <tt>rtf</tt> files)</li>")
    pdf_item = optional_item("pdftotext", "<li><tt>pdf</tt> documents</li>")

    res['Content-Type'] = "text/html"
    res.body = <<~BODY
      <html><body><h1>Ferret Server</h1>
      <p>
      Ferret is a tool for detecting copying in groups of documents,
      and was created by the (now defunct) Plagiarism Detection Group,
      University of Hertfordshire.
      </p>
      <form method="POST" enctype="multipart/form-data">
      <p>Compressed file: <input type="file" name="data" size="40">
      <p><input type="submit"/>
      </p>
      </form>

      <hr><h2>Instructions for use</h2>
      <p>
      <ol>
      <li>Construct a compressed folder of your files in a way suitable for
      your own computer. The Ferret Server will handle a compressed
      folder in one of the following forms:
      #{extensions}.
      The files within it may be as:
      <ul>
      <li>plain text files</li>
      #{word_item}
      #{pdf_item}
      </ul>
      Files may contain natural language text or computer programs (C-type
      languages).
      </li>
      <li>Use the 'Browse' button to select your compressed file.
      </li>
      <li>Once Ferret has finished analysing the documents, the display will show
      a table of the top 100 results.
      </li>
      <li>Click on the 'view' link beside each pair to see a report of
      the comparisons found in that pair of documents. Use the print option of
      your browser to preserve a copy (e.g. using 'print to pdf').
      </li>
      </ol>
      </p>
      <hr><font size=-1>Ferret home page generated on: #{Time.now}<br />.
      </font>
      </body>
      </html>
    BODY
  end

  # Convenience method to check if a string ends with given ending.
  def endsWith?(str, str_end)
    str.end_with?(str_end)
  end

  # Checks if given _filename_ ends with one of Utils::CompressedFileExtensions.
  def isCompressedFile?(filename)
    Utils::CompressedFileExtensions.any? { |ext| endsWith?(filename, ext) }
  end

  # Unpacks _filename_ into the current directory when its ending names a
  # supported archive type (rar, tbz2/tar.bz2, tgz/tar.gz, zip) and the
  # matching tool is available, then deletes _filename_.  The file is deleted
  # whether or not anything was unpacked.
  def decompress_file(filename)
    # A leading '-' would be read as an option by the tools, so anchor the name.
    target = filename.start_with?("-") ? "./#{filename}" : filename
    command =
      if endsWith?(filename, "rar")
        ["unrar", "x", target]
      elsif endsWith?(filename, "tbz2") || endsWith?(filename, "tar.bz2")
        ["tar", "jxf", target]
      elsif endsWith?(filename, "tgz") || endsWith?(filename, "tar.gz")
        ["tar", "zxf", target]
      elsif endsWith?(filename, "zip")
        ["unzip", target]
      end

    if command && Utils.command_present?(command.first)
      # Argument-vector form: no shell is involved, so characters in the
      # client-supplied file name cannot be interpreted as shell syntax.
      # The tool's stdout is discarded; failures are ignored (best effort).
      system(*command, out: File::NULL)
    end
    File.delete filename # remove the compressed folder
  end

  # Finds all files below _folder_ and writes the names of those accepted by
  # Utils.valid_document? into the definitions file inside _folder_.
  # Changes the process working directory to _folder_.
  # -- returns true if no file is reported as code by Utils.is_code?, else false
  def create_file_definitions(folder)
    Dir.chdir folder
    files = Find.find(folder).select { |path| File.file?(path) } # ignore directories

    File.open(DEFINITIONS_FILE, "w") do |defn_file|
      files.each { |path| defn_file.puts path if Utils.valid_document?(path) }
    end
    files.none? { |path| Utils.is_code?(path) }
  end

  # this method is triggered when the user clicks on 'submit query'
  def do_POST(req, res)
    upload_data = req.query["data"]
    filename = upload_filename(upload_data)
    return reject_upload(res) if filename.nil? # no file was submitted

    upload_dir = "Upload#{@@next_upload}" # create a unique folder for user's files
    @@next_upload += 1

    Dir.mkdir $base unless File.exist? $base
    Dir.mkdir "#{$base}/#{upload_dir}"
    uploaded_file = "#{$base}/#{upload_dir}/#{filename}"
    File.open(uploaded_file, "wb") do |file| # do the actual upload of the data
      upload_data.each_data { |data| file << data.to_s }
    end

    # if uploaded file is a compressed file, then decompress and compute similarities
    if isCompressedFile?(uploaded_file)
      Dir.chdir "#{$base}/#{upload_dir}"
      decompress_file File.basename(uploaded_file)
      is_text = create_file_definitions Dir.pwd

      # do the computation of similarities
      # -- output to html table with given folder name, using file definition list
      `#{FERRET} #{is_text ? "-t" : "-c"} -w -f #{DEFINITIONS_FILE} > results.html`
      res['Content-Type'] = "text/html"
      res.body = "<meta HTTP-EQUIV=\"REFRESH\" content=\"0; url=#{$base}/#{upload_dir}/results.html\">"
    else
      reject_upload(res)
    end
  end

  private

  # Returns _html_ when the program _command_ is available, else "".
  def optional_item(command, html)
    Utils.command_present?(command) ? html : ""
  end

  # Derives the name under which the upload is stored: directory components
  # sent by the client are dropped (so the file cannot land outside the upload
  # folder) and spaces are replaced by '-'.  Returns nil when there is no
  # usable name.
  def upload_filename(upload_data)
    raw = upload_data && upload_data.filename
    return nil if raw.nil?

    name = File.basename(raw.tr("\\", "/")).gsub(' ', "-")
    return nil if name.empty? || %w[. .. /].include?(name)

    name
  end

  # Fills _res_ with the page shown when the upload is not a compressed file.
  def reject_upload(res)
    res['Content-Type'] = "text/html"
    res.body = %{<html><body><h1>Error</h1>
<p>You did not submit a valid zip file.</p>
<p><a href="/ferret/home">Return to Ferret home page</a>.</p>
</body></html>}
  end
end
178
+
179
# Creates the xml report comparing two documents and redirects the browser
# to it.
#
# Expects the query parameters 'upload' (folder the report is written to),
# 'file1' and 'file2' (the two documents to compare).
#
# Relies on names defined outside this class: FERRET, Utils and the top-level
# counter @@next_report.
#
# NOTE(review): 'upload' is taken from the request as-is and used both as the
# working directory and as the place the report and stylesheet are written;
# consider restricting it to folders created by the upload servlet.
class FerretReportServlet < HTTPServlet::AbstractServlet

  # Handles the request to create a report in xml format.
  # Raises HTTPStatus::BadRequest when a required query parameter is missing.
  def do_GET(req, res)
    upload_dir = req.query['upload']
    file1 = req.query['file1']
    file2 = req.query['file2']
    unless upload_dir && file1 && file2
      raise HTTPStatus::BadRequest, "upload, file1 and file2 parameters are required"
    end

    file1 = file1.gsub("%20", " ")
    file2 = file2.gsub("%20", " ")
    report_name = "#{upload_dir}/report#{@@next_report}.xml"
    @@next_report += 1
    Dir.chdir upload_dir.to_s

    mode = Utils.is_code?(file1) ? "-c" : "-t"
    # The names come from the request, so escape them before they reach the shell.
    arguments = [report_name, file1, file2].map { |arg| Shellwords.escape(arg) }.join(" ")
    `#{FERRET} #{mode} -x #{arguments}`
    write_style_sheet File.dirname(report_name)

    res['Content-Type'] = "text/html"
    # Escape the url so request-supplied text cannot break out of the attribute.
    res.body = "<meta HTTP-EQUIV=\"REFRESH\" content=\"0; url=#{WEBrick::HTMLUtils.escape(report_name)}\">"
  end

  private

  # Writes the stylesheet uhferret.xsl into _dir_.
  def write_style_sheet(dir)
    File.open("#{dir}/uhferret.xsl", "w") do |f|
      # Squiggly heredoc: the XML declaration must start at the first column.
      f.puts <<~STYLESHEET
        <?xml version="1.0" encoding="ISO-8859-1"?>

        <html xsl:version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns="http://www.w3.org/1999/xhtml">
          <head>
            <style> <!-- style sheet for document -->
              h1 {background-color: #d0d0d0} <!-- add a background to make headings stand out -->
              h2 {background-color: #d0d0d0}
              .highlight {font-weight:bold; color:blue} <!-- highlighted text style -->
              .normal {font-weight:normal} <!-- normal text style -->
            </style>
          </head>
          <body>
            <h1>UH-Ferret: Document comparison</h1>

            <!-- display top-level information -->
            <p>Common trigrams: <xsl:value-of select="uhferret/common-trigrams"/></p>
            <p>Similarity: <xsl:value-of select="uhferret/similarity"/></p>

            <!-- work through each document -->
            <xsl:for-each select="uhferret/document">
              <!-- display document-level information -->
              <h2>Document: <xsl:value-of select="source"/></h2>
              <p>Number of trigrams: <xsl:value-of select="num-trigrams"/></p>
              <p>Containment in other document: <xsl:value-of select="containment"/></p>

              <!-- work through each block in text, displaying as highlighted or normal -->
              <pre>
                <xsl:for-each select="text/block">
                  <xsl:if test="@text='copied'">
                    <span class="highlight"><xsl:value-of select="."/></span>
                  </xsl:if>
                  <xsl:if test="@text='normal'">
                    <span class="normal"><xsl:value-of select="."/></span>
                  </xsl:if>
                </xsl:for-each>
              </pre>
            </xsl:for-each>
          </body>
        </html>
      STYLESHEET
    end
  end
end
245
+
246
+ end
metadata ADDED
@@ -0,0 +1,71 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: uhferret
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.3.7
5
+ platform: ruby
6
+ authors:
7
+ - Peter Lane
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2020-11-10 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: "UHFerret is a copy-detection tool to analyse large sets of documents
14
+ \nto find pairs of documents with substantial amounts of lexical copying. \nDocuments
15
+ containing either natural language (e.g. English) or computer \nprograms (in C-family)
16
+ may be processed. \n"
17
+ email: peterlane@gmx.com
18
+ executables:
19
+ - uhferret
20
+ - uhferret-server
21
+ extensions:
22
+ - ext/extconf.rb
23
+ extra_rdoc_files:
24
+ - README.rdoc
25
+ - COPYING.txt
26
+ files:
27
+ - COPYING.txt
28
+ - README.rdoc
29
+ - bin/uhferret
30
+ - bin/uhferret-server
31
+ - ext/document.cpp
32
+ - ext/document.h
33
+ - ext/documentlist.cpp
34
+ - ext/documentlist.h
35
+ - ext/extconf.rb
36
+ - ext/tokenreader.cpp
37
+ - ext/tokenreader.h
38
+ - ext/tokenset.cpp
39
+ - ext/tokenset.h
40
+ - ext/tupleset.cpp
41
+ - ext/tupleset.h
42
+ - ext/uhferret_lib_wrap.cxx
43
+ - lib/uhferret.rb
44
+ - lib/utils.rb
45
+ - lib/webferret.rb
46
+ homepage: https://peterlane.netlify.app/ferret/
47
+ licenses:
48
+ - GPL-3.0+
49
+ metadata: {}
50
+ post_install_message:
51
+ rdoc_options:
52
+ - "-m"
53
+ - README.rdoc
54
+ require_paths:
55
+ - lib
56
+ required_ruby_version: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - ">="
59
+ - !ruby/object:Gem::Version
60
+ version: '2.5'
61
+ required_rubygems_version: !ruby/object:Gem::Requirement
62
+ requirements:
63
+ - - ">="
64
+ - !ruby/object:Gem::Version
65
+ version: '0'
66
+ requirements: []
67
+ rubygems_version: 3.1.4
68
+ signing_key:
69
+ specification_version: 4
70
+ summary: UHFerret is a copy-detection tool.
71
+ test_files: []