uhferret 1.3.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,246 @@
1
+ #--
2
+ # This file is part of uhferret.
3
+ #
4
+ # Author:: Peter Lane
5
+ # Copyright:: Copyright 2012, Peter Lane.
6
+ # License:: GPLv3
7
+ #
8
+ # uhferret is free software: you can redistribute it and/or modify
9
+ # it under the terms of the GNU General Public License as published by
10
+ # the Free Software Foundation, either version 3 of the License, or
11
+ # (at your option) any later version.
12
+ #
13
+ # uhferret is distributed in the hope that it will be useful,
14
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
+ # GNU General Public License for more details.
17
+ #
18
+ # You should have received a copy of the GNU General Public License
19
+ # along with uhferret. If not, see <http://www.gnu.org/licenses/>.
20
+
21
require 'find'
require 'shellwords'
require 'uhferret'
require 'utils'
require 'webrick'
25
+ include WEBrick
26
+
27
# Counters shared by the servlets below. They are class variables stored on
# Object (not global variables), so the `@@next_upload` / `@@next_report`
# references inside the servlet classes find them through inheritance.
# They are set with class_variable_set because writing `@@name = ...` directly
# at the top level raises a RuntimeError on Ruby 3.0 and later.
Object.class_variable_set(:@@next_upload, 0) # number of uploads so far, used for naming upload folders
Object.class_variable_set(:@@next_report, 0) # number of reports created so far, used for naming report files
29
+
30
+ module UHFerret
31
+
32
+ # Displays a welcome page, providing a field to upload the zipped file.
33
+ # On pressing 'submit', runs Ferret and passes results to FerretResultsServlet.
34
+ class FerretHomeServlet < HTTPServlet::AbstractServlet
35
+
36
# Returns the 'welcome page' html: a short description of Ferret, the upload
# form (which posts the chosen file as the "data" field) and usage instructions.
#
# _req_:: the incoming request; it is not inspected.
# _res_:: the response; its Content-Type header and body are set here.
def do_GET(req, res)
  # Extensions are listed without their leading dot, so add it for display.
  extensions = Utils::CompressedFileExtensions.map { |ext| "<tt>.#{ext}</tt>" }.join(", ")

  # Only mention a document format when Utils reports its helper command as present.
  formats = ["<li>plain text files</li>"]
  if Utils.command_present?("abiword")
    formats << "<li>word-processed files (such as <tt>doc</tt> or <tt>rtf</tt> files)</li>"
  end
  formats << "<li><tt>pdf</tt> documents</li>" if Utils.command_present?("pdftotext")

  res['Content-Type'] = "text/html"
  res.body = <<~BODY
    <html><body><h1>Ferret Server</h1>
    <p>
    Ferret is a tool for detecting copying in groups of documents,
    and was created by the (now defunct) Plagiarism Detection Group,
    University of Hertfordshire.
    </p>
    <form method="POST" enctype="multipart/form-data">
    <p>Compressed file: <input type="file" name="data" size="40"></p>
    <p><input type="submit"/></p>
    </form>

    <hr><h2>Instructions for use</h2>
    <ol>
    <li>Construct a compressed folder of your files in a way suitable for
    your own computer. The Ferret Server will handle a compressed
    folder in one of the following forms:
    #{extensions}.
    The files within it may be as:
    <ul>
    #{formats.join("\n")}
    </ul>
    Files may contain natural language text or computer programs (C-type
    languages).
    </li>
    <li>Use the 'Browse' button to select your compressed file.
    </li>
    <li>Once Ferret has finished analysing the documents, the display will show
    a table of the top 100 results.
    </li>
    <li>Click on the 'view' link beside each pair to see a report of
    the comparisons found in that pair of documents. Use the print option of
    your browser to preserve a copy (e.g. using 'print to pdf').
    </li>
    </ol>
    <hr><font size=-1>Ferret home page generated on: #{Time.now}<br />
    </font>
    </body>
    </html>
  BODY
end
95
+
96
# Convenience method to check if _str_ ends with _str_end_.
#
# Delegates to String#end_with?, so an empty _str_end_ matches every string
# (the previous hand-written slice comparison wrongly returned false for an
# empty ending on a non-empty string).
#
# Returns true or false.
def endsWith?(str, str_end)
  str.end_with?(str_end)
end
101
+
102
# Reports whether _filename_ ends with one of the extensions listed in
# Utils::CompressedFileExtensions.
def isCompressedFile?(filename)
  known_extensions = Utils::CompressedFileExtensions
  known_extensions.any? do |extension|
    endsWith?(filename, extension)
  end
end
106
+
107
# If _filename_ names a known compressed file format (rar, tbz2/tar.bz2,
# tgz/tar.gz or zip) and Utils reports the matching tool as present, the
# archive is unpacked into the current directory. In every case _filename_
# is deleted afterwards.
#
# The tool is started with an argument list rather than through a shell
# command string, because _filename_ comes from the uploading client and
# could otherwise smuggle shell commands (e.g. "x;rm -rf ~;.zip").
#
# Returns the result of File.delete.
def decompress_file(filename)
  command =
    if filename.end_with?("rar")
      %w[unrar x]
    elsif filename.end_with?("tbz2", "tar.bz2")
      %w[tar jxf]
    elsif filename.end_with?("tgz", "tar.gz")
      %w[tar zxf]
    elsif filename.end_with?("zip")
      %w[unzip]
    end

  if command && Utils.command_present?(command.first)
    # An absolute path cannot be mistaken for a command-line option, which a
    # bare name such as "-o.zip" could be. absolute_path does not expand "~".
    archive = File.absolute_path(filename)
    # The tool's standard output is discarded, as it was with backticks.
    system(*command, archive, out: File::NULL)
  end

  File.delete filename # remove the compressed folder
end
121
+
122
# Makes _folder_ the current directory, collects every regular file found
# beneath it, and writes the names of those accepted by Utils.valid_document?
# (one per line) to "ferret-file-definitions.def" in that directory.
#
# Returns true when none of the files is recognised by Utils.is_code?,
# and false as soon as at least one of them is.
def create_file_definitions(folder)
  Dir.chdir(folder)

  # directories are skipped; only regular files are of interest
  documents = Find.find(folder).select { |path| File.file?(path) }
  contains_code = documents.map { |path| Utils.is_code?(path) }.any?

  # record the valid documents in the definitions file
  File.open("ferret-file-definitions.def", "w") do |defn_file|
    documents.each do |path|
      next unless Utils.valid_document?(path)
      defn_file.puts path
    end
  end

  !contains_code
end
141
+
142
# This method is triggered when the user submits the upload form.
#
# For a recognised compressed file it stores the upload in a fresh
# "Upload<n>" folder beneath $base, unpacks it, runs Ferret over the
# contents (writing results.html into that folder) and answers with a page
# that redirects to the results. Anything else gets an error page and
# nothing is written to disk.
#
# _req_:: the request; the uploaded file is read from its "data" form field.
# _res_:: the response; its Content-Type header and body are set here.
def do_POST(req, res)
  res['Content-Type'] = "text/html"

  upload_data = req.query["data"] # nil when the form field is missing
  # The filename is chosen by the client: keep only its last path component
  # (treating "\" as a separator too) so that it cannot point outside the
  # upload folder, then replace spaces as before.
  filename =
    if upload_data
      File.basename(upload_data.filename.to_s.tr("\\", "/")).gsub(' ', "-")
    else
      ""
    end

  unless isCompressedFile?(filename)
    res.body = %{<html><body><h1>Error</h1>
<p>You did not submit a valid zip file.</p>
<p><a href="/ferret/home">Return to Ferret home page</a>.</p>
</body></html>}
    return
  end

  upload_dir = "Upload#{@@next_upload}" # create a unique folder for user's files
  @@next_upload += 1

  Dir.mkdir $base unless File.exist? $base # File.exists? no longer exists in Ruby 3.2+
  Dir.mkdir "#{$base}/#{upload_dir}"
  uploaded_file = "#{$base}/#{upload_dir}/#{filename}"
  File.open(uploaded_file, "wb") do |file| # do the actual upload of the data
    upload_data.each_data do |data|
      file << data.to_s
    end
  end

  # decompress the upload inside its own folder and compute similarities
  Dir.chdir "#{$base}/#{upload_dir}"
  decompress_file filename
  is_text = create_file_definitions Dir.pwd

  # do the computation of similarities
  # -- output to html table with given folder name, using file definition list
  `#{FERRET} #{is_text ? "-t" : "-c"} -w -f ferret-file-definitions.def > results.html`
  res.body = "<meta HTTP-EQUIV=\"REFRESH\" content=\"0; url=#{$base}/#{upload_dir}/results.html\">"
end
177
+ end
178
+
179
+ # This servlet is triggered by a click on 'Download' link in report table
180
+ # It creates the xml report comparing two documents
181
+ class FerretReportServlet < HTTPServlet::AbstractServlet
182
+
183
# Handles the request to create a report in xml format.
#
# Reads the "upload", "file1" and "file2" query parameters, runs Ferret on
# the two files so that it writes "report<n>.xml" into the upload folder,
# writes the accompanying stylesheet next to it, and answers with a page
# that redirects to the report.
#
# _req_:: the request carrying the three query parameters.
# _res_:: the response; its Content-Type header and body are set here.
def do_GET(req, res)
  upload_dir = req.query['upload']
  file1 = req.query['file1'].gsub("%20", " ")
  file2 = req.query['file2'].gsub("%20", " ")
  report_name = "#{upload_dir}/report#{@@next_report}.xml"
  @@next_report += 1
  # NOTE(review): upload_dir comes straight from the query string and is used
  # unchecked as the working directory and as the place the report and
  # stylesheet are written -- consider restricting it to folders under $base.
  Dir.chdir upload_dir

  mode = Utils.is_code?(file1) ? "-c" : "-t"
  # The names come from the query string, so each one is shell-escaped;
  # plain double quotes would not stop $(...) or an embedded quote character.
  arguments = [report_name, file1, file2].map { |arg| Shellwords.escape(arg) }
  `#{FERRET} #{mode} -x #{arguments.join(" ")}`
  write_style_sheet File.dirname(report_name)
  res['Content-Type'] = "text/html"
  res.body = "<meta HTTP-EQUIV=\"REFRESH\" content=\"0; url=#{report_name}\">"
end
197
+
198
+ private
199
# Writes the XSL stylesheet "uhferret.xsl" into _dir_, replacing any existing
# copy. The stylesheet turns a uhferret xml report into an html page showing
# the overall figures, then each document with its copied blocks highlighted.
#
# _dir_:: path of the folder that receives the stylesheet.
def write_style_sheet(dir)
  File.open("#{dir}/uhferret.xsl", "w") do |f|
    # The squiggly heredoc strips the indentation, so the XML declaration
    # starts at the very beginning of the file.
    f.puts <<~STYLESHEET
      <?xml version="1.0" encoding="ISO-8859-1"?>

      <html xsl:version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns="http://www.w3.org/1999/xhtml">
        <head>
          <style> <!-- style sheet for document -->
            h1 {background-color: #d0d0d0} <!-- add a background to make headings stand out -->
            h2 {background-color: #d0d0d0}
            .highlight {font-weight:bold; color:blue} <!-- highlighted text style -->
            .normal {font-weight:normal} <!-- normal text style -->
          </style>
        </head>
        <body>
          <h1>UH-Ferret: Document comparison</h1>

          <!-- display top-level information -->
          <p>Common trigrams: <xsl:value-of select="uhferret/common-trigrams"/></p>
          <p>Similarity: <xsl:value-of select="uhferret/similarity"/></p>

          <!-- work through each document -->
          <xsl:for-each select="uhferret/document">
            <!-- display document-level information -->
            <h2>Document: <xsl:value-of select="source"/></h2>
            <p>Number of trigrams: <xsl:value-of select="num-trigrams"/></p>
            <p>Containment in other document: <xsl:value-of select="containment"/></p>

            <!-- work through each block in text, displaying as highlighted or normal -->
            <pre>
              <xsl:for-each select="text/block">
                <xsl:if test="@text='copied'">
                  <span class="highlight"><xsl:value-of select="."/></span>
                </xsl:if>
                <xsl:if test="@text='normal'">
                  <span class="normal"><xsl:value-of select="."/></span>
                </xsl:if>
              </xsl:for-each>
            </pre>
          </xsl:for-each>
        </body>
      </html>
    STYLESHEET
  end
end
244
+ end
245
+
246
+ end
metadata ADDED
@@ -0,0 +1,71 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: uhferret
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.3.7
5
+ platform: ruby
6
+ authors:
7
+ - Peter Lane
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2020-11-10 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: "UHFerret is a copy-detection tool to analyse large sets of documents
14
+ \nto find pairs of documents with substantial amounts of lexical copying. \nDocuments
15
+ containing either natural language (e.g. English) or computer \nprograms (in C-family)
16
+ may be processed. \n"
17
+ email: peterlane@gmx.com
18
+ executables:
19
+ - uhferret
20
+ - uhferret-server
21
+ extensions:
22
+ - ext/extconf.rb
23
+ extra_rdoc_files:
24
+ - README.rdoc
25
+ - COPYING.txt
26
+ files:
27
+ - COPYING.txt
28
+ - README.rdoc
29
+ - bin/uhferret
30
+ - bin/uhferret-server
31
+ - ext/document.cpp
32
+ - ext/document.h
33
+ - ext/documentlist.cpp
34
+ - ext/documentlist.h
35
+ - ext/extconf.rb
36
+ - ext/tokenreader.cpp
37
+ - ext/tokenreader.h
38
+ - ext/tokenset.cpp
39
+ - ext/tokenset.h
40
+ - ext/tupleset.cpp
41
+ - ext/tupleset.h
42
+ - ext/uhferret_lib_wrap.cxx
43
+ - lib/uhferret.rb
44
+ - lib/utils.rb
45
+ - lib/webferret.rb
46
+ homepage: https://peterlane.netlify.app/ferret/
47
+ licenses:
48
+ - GPL-3.0+
49
+ metadata: {}
50
+ post_install_message:
51
+ rdoc_options:
52
+ - "-m"
53
+ - README.rdoc
54
+ require_paths:
55
+ - lib
56
+ required_ruby_version: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - ">="
59
+ - !ruby/object:Gem::Version
60
+ version: '2.5'
61
+ required_rubygems_version: !ruby/object:Gem::Requirement
62
+ requirements:
63
+ - - ">="
64
+ - !ruby/object:Gem::Version
65
+ version: '0'
66
+ requirements: []
67
+ rubygems_version: 3.1.4
68
+ signing_key:
69
+ specification_version: 4
70
+ summary: UHFerret is a copy-detection tool.
71
+ test_files: []