lbp 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/.rspec +2 -0
- data/.ruby-gemset +1 -0
- data/.ruby-version +1 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +31 -0
- data/Rakefile +8 -0
- data/bin/lbp +115 -0
- data/lbp.gemspec +36 -0
- data/lib/lbp.rb +11 -0
- data/lib/lbp/collection.rb +131 -0
- data/lib/lbp/functions.rb +12 -0
- data/lib/lbp/item.rb +153 -0
- data/lib/lbp/item_group.rb +52 -0
- data/lib/lbp/paragraph.rb +87 -0
- data/lib/lbp/transcription.rb +249 -0
- data/lib/lbp/version.rb +3 -0
- data/spec/collection_spec.rb +60 -0
- data/spec/config_globals.rb +18 -0
- data/spec/item_group_spec.rb +39 -0
- data/spec/item_spec.rb +74 -0
- data/spec/paragraph_spec.rb +37 -0
- data/spec/spec_helper.rb +89 -0
- data/spec/transcription_spec.rb +120 -0
- metadata +218 -0
@@ -0,0 +1,52 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'rugged'
|
3
|
+
require 'lbp/functions'
|
4
|
+
require 'lbp/transcription'
|
5
|
+
|
6
|
+
module Lbp
  # A group of items inside a project file, identified by the @id attribute
  # of a <div> element.
  #
  # Every query re-reads and re-parses the project file, so results reflect
  # the file's contents at the time of the call.
  class ItemGroup
    attr_reader :igid

    # projectfile - path of the XML project file (read with File.read).
    # igid        - value of the @id attribute of the group's <div>.
    def initialize(projectfile, igid)
      @igid = igid
      @projectfile = projectfile
    end

    # Returns an Array holding one Item for every
    # item/fileName/@filestem attribute found beneath the group's <div>.
    def items
      group_xpath("//item/fileName/@filestem").map do |filestem|
        Item.new(@projectfile, filestem.value)
      end
    end

    # Returns an Item for the filestem +fs+. The filestem is not checked
    # against the members of this group.
    def item(fs)
      Item.new(@projectfile, fs)
    end

    # Returns the text of the <head> child(ren) of the group's <div>
    # (an empty String when there is none).
    def title
      group_xpath("/head").text
    end

    # True when the group's <div> has at least one descendant <div>.
    def has_sub_group?
      !group_xpath("//div").empty?
    end

    # True unless the group's <div> carries class='toplevel'.
    # NOTE: an id that matches no <div> at all therefore also answers true.
    def has_parent_group?
      group_xpath("[@class='toplevel']").empty?
    end

    private

    # Parses the project file and evaluates +suffix+ relative to the <div>
    # whose @id equals the group id.
    #
    # The id is handed to the XPath engine as the variable $igid instead of
    # being interpolated into the expression, so an id containing a quote
    # character can neither break nor alter the query.
    def group_xpath(suffix)
      project = Nokogiri::XML(File.read(@projectfile))
      project.xpath("//div[@id=$igid]#{suffix}", nil, { igid: @igid.to_s })
    end
  end
end
|
@@ -0,0 +1,87 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'rugged'
|
3
|
+
require 'lbp/functions'
|
4
|
+
|
5
|
+
module Lbp
  # One paragraph (a TEI <p> element addressed by its xml:id) of a
  # transcription.
  #
  # The object only stores identifiers; every query builds a fresh
  # Transcription from the project file and file hash it was given.
  class Paragraph
    # Namespace binding used by every TEI XPath query in this class.
    TEI_NS = { 'tei' => 'http://www.tei-c.org/ns/1.0' }.freeze
    private_constant :TEI_NS

    attr_reader :pid

    # projectfile - project file handed on to Collection and Transcription.
    # filehash    - file hash handed on to Transcription.
    # pid         - xml:id of the paragraph.
    def initialize(projectfile, filehash, pid)
      @projectfile = projectfile
      @filehash = filehash
      @pid = pid

      # NOTE(review): this stores the Collection object itself although the
      # variable is called @confighash, and nothing in this class reads it.
      # Confirm whether `.confighash` was intended.
      @confighash = Collection.new(@projectfile)
    end

    # Returns the position of this paragraph among the body paragraphs:
    # the total number of body paragraphs minus the number of body
    # paragraphs that have this paragraph on their preceding axis.
    def number
      transcr = transcription
      total = transcr.number_of_body_paragraphs
      following = transcr.nokogiri.xpath(
        "//tei:body//tei:p[preceding::tei:p[@xml:id='#{@pid}']]", TEI_NS
      ).count

      total - following
    end

    # Returns the Paragraph for the first <p> on the following axis, or nil
    # when there is none.
    def next
      adjacent_paragraph('following')
    end

    # Returns the Paragraph for the first <p> on the preceding axis, or nil
    # when there is none.
    def previous
      adjacent_paragraph('preceding')
    end

    # Transforms the whole transcription with +xsltfile+ and returns the
    # <p> element(s) of the result whose @id equals this paragraph's id.
    def transform(xsltfile, xslt_param_array=[])
      transformed = transcription.transform(xsltfile, xslt_param_array)
      transformed.xpath("//p[@id='#{@pid}']")
    end

    # Note that, perhaps confusingly, the paragraph's plain text is produced
    # with the transcription's "clean" transform: the basic paragraph
    # elements are still needed in order to select the desired paragraph.
    def transform_plain_text(xslt_param_array=[])
      cleaned = transcription.transform_clean(xslt_param_array)
      cleaned.xpath("//p[@id='#{@pid}']")
    end

    # Number of whitespace-separated words in the paragraph's plain text.
    def word_count
      transform_plain_text.text.split.size
    end

    # Array of the paragraph's words, downcased, in document order.
    def word_array
      transform_plain_text.text.split.map { |word| word.downcase }
    end

    # Returns a Hash of word => number of occurrences.
    #
    # sort  - 'frequency' or 'word'.
    # order - 'descending' or 'ascending'.
    #
    # Any other value for either argument leaves the counts unsorted.
    def word_frequency(sort='frequency', order='descending')
      counts = word_array.each_with_object(Hash.new(0)) { |word, acc| acc[word] += 1 }

      sorted =
        case sort
        when "frequency" then counts.sort_by { |_word, count| count }
        when "word"      then counts.sort_by { |word, _count| word }
        end
      return counts unless sorted

      case order
      when "descending" then sorted.reverse.to_h
      when "ascending"  then sorted.to_h
      else counts
      end
    end

    private

    # Builds a fresh Transcription for the file this paragraph belongs to.
    def transcription
      Transcription.new(@projectfile, @filehash)
    end

    # Looks up the xml:id of the first <p> on +axis+ ('following' or
    # 'preceding') relative to this paragraph. An empty node set means there
    # is no such paragraph; NodeSet#text would be "" (never nil) in that
    # case, so emptiness is what has to be tested.
    def adjacent_paragraph(axis)
      ids = transcription.nokogiri.xpath(
        "//tei:p[@xml:id='#{@pid}']/#{axis}::tei:p[1]/@xml:id", TEI_NS
      )
      return nil if ids.empty?

      Paragraph.new(@projectfile, @filehash, ids.text)
    end
  end
end
|
@@ -0,0 +1,249 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'rugged'
|
3
|
+
require 'lbp/functions'
|
4
|
+
require 'lbp/item'
|
5
|
+
require 'open-uri'
|
6
|
+
|
7
|
+
module Lbp
  # A single transcription file of an item, described by a file hash.
  #
  # The file hash is read for the keys :fs, :type ('critical' or
  # 'documentary'), :ed, :source and :path. When :source is 'local' and the
  # item's current git branch differs from :ed, the methods that go through
  # the checkout helper temporarily check out :ed and restore the previous
  # branch afterwards.
  class Transcription
    # Namespace binding used by every TEI XPath query in this class.
    TEI_NS = { 'tei' => 'http://www.tei-c.org/ns/1.0' }.freeze
    private_constant :TEI_NS

    attr_reader :fs, :type, :ed, :xslt_dir

    # projectfile - project file handed on to Collection, Item and Paragraph.
    # filehash    - Hash with the keys listed in the class description.
    def initialize(projectfile, filehash)
      @filehash = filehash
      @projectfile = projectfile

      @fs = filehash[:fs]
      @type = filehash[:type] # critical or documentary
      @ed = filehash[:ed]

      @confighash = Collection.new(@projectfile).confighash
      @xslthash = @confighash[:xslt_dirs]

      # The xslt version needs to be gathered from a method; for now it is
      # nil because no documents currently declare it, so the "default"
      # schema is always selected.
      xslt_version = nil
      @schema = @xslthash[xslt_version || "default"]

      # Any other type leaves xslt_dir nil.
      @xslt_dir =
        case @type
        when 'critical'    then @schema[:critical]
        when 'documentary' then @schema[:documentary]
        end

      return unless @filehash[:source] == 'local'

      item = Item.new(@projectfile, @fs)
      @current_branch = item.git_current_branch
      # the effort here is to only set the instance variable when absolutely necessary
      @item = item if @current_branch != @ed
    end

    ## Begin file path methods

    # Returns the :path entry of the file hash.
    def file_path
      @filehash[:path]
    end

    # Opens file_path and returns the IO-like object; the caller is
    # responsible for closing it.
    #
    # URI.open is preferred when open-uri provides it, because Kernel#open
    # no longer opens URLs on Ruby 3; URI.open still falls back to
    # Kernel#open for plain file paths. Older rubies keep using Kernel#open.
    def file
      path = file_path
      URI.respond_to?(:open) ? URI.open(path) : open(path)
    end

    # Parses the file into a Nokogiri XML document and closes the underlying
    # IO once parsing has finished.
    def nokogiri
      io = file
      Nokogiri::XML(io)
    ensure
      io.close if io.respond_to?(:close) && !io.closed?
    end
    ## End File Path Methods

    ### Item Header Extraction and Metadata Methods

    # Text of the first title in the TEI header's titleStmt.
    def title
      header_text("/tei:TEI/tei:teiHeader[1]/tei:fileDesc[1]/tei:titleStmt[1]/tei:title[1]")
    end

    # Concatenated text of the author element(s) in the titleStmt.
    def author
      header_text("/tei:TEI/tei:teiHeader[1]/tei:fileDesc/tei:titleStmt[1]/tei:author")
    end

    # Concatenated text of the editor element(s) in the titleStmt.
    def editor
      header_text("/tei:TEI/tei:teiHeader[1]/tei:fileDesc/tei:titleStmt[1]/tei:editor")
    end

    # Value of edition/@n, or nil when the attribute is absent.
    def ed_no
      header_attribute("/tei:TEI/tei:teiHeader[1]/tei:fileDesc[1]/tei:editionStmt[1]/tei:edition[1]/@n")
    end

    # Value of edition/date/@when, or nil when the attribute is absent.
    def ed_date
      header_attribute("/tei:TEI/tei:teiHeader[1]/tei:fileDesc[1]/tei:editionStmt[1]/tei:edition[1]/tei:date[1]/@when")
    end

    # Value of publicationStmt/date/@when, or nil when the attribute is absent.
    def pub_date
      header_attribute("/tei:TEI/tei:teiHeader[1]/tei:fileDesc[1]/tei:publicationStmt[1]/tei:date[1]/@when")
    end

    # Value of variantEncoding/@method, or nil when the attribute is absent.
    def encoding_method
      header_attribute("/tei:TEI/tei:teiHeader[1]/tei:encodingDesc[1]/tei:variantEncoding[1]/@method")
    end

    # Value of variantEncoding/@location, or nil when the attribute is absent.
    def encoding_location
      header_attribute("/tei:TEI/tei:teiHeader[1]/tei:encodingDesc[1]/tei:variantEncoding[1]/@location")
    end

    # Returns nil for critical transcriptions, 1 when the document contains
    # a tei:pb element, 2 when it contains a tei:cb element (and no tei:pb),
    # and nil otherwise.
    #
    # NOTE(review): tei:pb is tested before tei:cb, so a document containing
    # both reports 1 — confirm that this precedence is intended.
    def number_of_columns
      return nil if @type == "critical"

      xmldoc = nokogiri
      if xmldoc.xpath("//tei:pb", TEI_NS).count != 0
        1
      elsif xmldoc.xpath("//tei:cb", TEI_NS).count != 0
        2
      end
    end

=begin - I think these methods belong with the Item or ItemRepo Object

    ### End Header and Metadata Information Extraction Methods ###
    ### Begin GIT functions ###
    def is_git_dir
      gitpath = @file_dir + ".git"

      if File.directory?(gitpath)
        true
      else
        false
      end
    end
    def git_branches
      repo = Rugged::Repository.new(@file_dir)
      branches = repo.branches.map { |branch| branch.name }
      return branches
    end
    def git_current_branch
      repo = Rugged::Repository.new(@file_dir)
      current_branch = repo.head.name.gsub(%r!\Arefs/heads/(.*)\z!) { $1 }
      return current_branch
    end
    def git_tags
      repo = Rugged::Repository.new(@file_dir)
      tags = repo.tags.map { |tag| tag.name }
      return tags
    end
    #need test for this
    def git_checkout(branch)
      repo = Rugged::Repository.new(@file_dir)
      repo.checkout(branch)
    end
    ### End Git Methods ###
=end

    ### Begin transform (XSLT) methods ###

    # Runs xslt_transform on the file with the stylesheet +xsltfile+ and the
    # parameters +xslt_param_array+, checking out the requested edition
    # first when necessary. Returns whatever xslt_transform returns.
    def transform(xsltfile, xslt_param_array=[])
      xmlfile = file_path
      with_edition_checked_out do
        xslt_transform(xmlfile, xsltfile, xslt_param_array)
      end
    end

    # Each view below resolves its stylesheet name from the schema, appends
    # it to xslt_dir and forwards the caller's parameters to #transform.

    def transform_main_view(xslt_param_array=[])
      transform(@xslt_dir + @schema[:main_view], xslt_param_array) # "text_display.xsl"
    end

    def transform_index_view(xslt_param_array=[])
      transform(@xslt_dir + @schema[:index_view], xslt_param_array) # "text_display_index.xsl"
    end

    def transform_clean(xslt_param_array=[])
      transform(@xslt_dir + @schema[:clean_view], xslt_param_array) # "clean_forStatistics.xsl"
    end

    def transform_plain_text(xslt_param_array=[])
      transform(@xslt_dir + @schema[:plain_text], xslt_param_array) # "plaintext.xsl"
    end

    def transform_toc(xslt_param_array=[])
      transform(@xslt_dir + @schema[:toc], xslt_param_array) # "lectio_outline.xsl"
    end
    ### End of Transformation Methods ###

    ### Begin Statistics Methods ###

    # Number of whitespace-separated words in the plain-text transform.
    def word_count
      transform_plain_text.text.split.size
    end

    # Array of the words of the plain-text transform, downcased.
    def word_array
      transform_plain_text.text.split.map { |word| word.downcase }
    end

    # Returns a Hash of word => number of occurrences.
    #
    # sort  - 'frequency' or 'word'.
    # order - 'descending' or 'ascending'.
    #
    # Any other value for either argument leaves the counts unsorted. The
    # defaults mirror Paragraph#word_frequency.
    def word_frequency(sort='frequency', order='descending')
      counts = word_array.each_with_object(Hash.new(0)) { |word, acc| acc[word] += 1 }

      sorted =
        case sort
        when "frequency" then counts.sort_by { |_word, count| count }
        when "word"      then counts.sort_by { |word, _count| word }
        end
      return counts unless sorted

      case order
      when "descending" then sorted.reverse.to_h
      when "ascending"  then sorted.to_h
      else counts
      end
    end

    # Number of tei:p elements beneath tei:body.
    def number_of_body_paragraphs
      with_edition_checked_out do
        nokogiri.xpath("//tei:body//tei:p", TEI_NS)
      end.count
    end

    # Returns an Array with one Paragraph per xml:id attribute found on the
    # tei:p elements beneath tei:body.
    def paragraphs
      ids = with_edition_checked_out do
        nokogiri.xpath("//tei:body//tei:p/@xml:id", TEI_NS)
      end

      ids.map { |id| Paragraph.new(@projectfile, @filehash, id.value) }
    end

    # Returns the Paragraph with the xml:id +pid+ (not checked for existence).
    def paragraph(pid)
      Paragraph.new(@projectfile, @filehash, pid)
    end

    private

    # Yields with the requested edition checked out and returns the block's
    # value. When the source is local and the current branch differs from
    # the edition, the edition is checked out first and the previous branch
    # is restored afterwards, even if the block raises.
    def with_edition_checked_out
      return yield unless @current_branch != @ed && @filehash[:source] == 'local'

      @item.git_checkout(@ed)
      begin
        yield
      ensure
        @item.git_checkout(@current_branch)
      end
    end

    # Text content of the nodes matched by the TEI XPath +path+.
    def header_text(path)
      nokogiri.xpath(path, TEI_NS).text
    end

    # Value of the first attribute matched by the TEI XPath +path+, or nil
    # when the document has no such attribute.
    def header_attribute(path)
      attribute = nokogiri.at_xpath(path, TEI_NS)
      attribute && attribute.value
    end
  end
end
|
data/lib/lbp/version.rb
ADDED
@@ -0,0 +1,60 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'lbp'
|
3
|
+
require 'pry'
|
4
|
+
require 'nokogiri'
|
5
|
+
|
6
|
+
describe 'collection object' do
  require_relative "config_globals"

  # One Collection shared by every example below, built from the global
  # $pg_projectfile (presumably set by config_globals — verify there).
  $collection_obj = Lbp::Collection.new($pg_projectfile)

  it 'should get list of item filestems in sequenced array' do
    expect($collection_obj.item_filestems).to be_a(Array)
  end

  it 'should get a list of item names in sequenced array' do
    expect($collection_obj.item_titles).to be_a(Array)
  end

  it 'should return a hash of filestems and item names' do
    expect($collection_obj.items_fs_title_hash).to be_a(Hash)
  end

  it 'should get list of item objects in an array' do
    # NOTE: calling `title` on the first returned item raised an error when
    # this spec was written, so only the container type is checked.
    expect($collection_obj.items).to be_a(Array)
  end

  it 'should return local texts dir' do
    expect($collection_obj.local_texts_dir).to be_a(String)
  end

  it 'should return general repo directory' do
    expect($collection_obj.git_repo).to be_a(String)
  end

  it 'should return citation lists directory' do
    expect($collection_obj.citation_lists_dir).to be_a(String)
  end

  it 'should return xslt hash' do
    expect($collection_obj.xslt_dirs).to be_a(Hash)
  end

  it 'should return a specific item object when a specific item group id is given' do
    expect($collection_obj.item('lectio1')).to be_a(Lbp::Item)
  end

  it 'should return the title of a given collection specified in the project data file' do
    expect($collection_obj.title).to be_a(String)
  end
end
|