lbp 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/.rspec +2 -0
- data/.ruby-gemset +1 -0
- data/.ruby-version +1 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +31 -0
- data/Rakefile +8 -0
- data/bin/lbp +115 -0
- data/lbp.gemspec +36 -0
- data/lib/lbp.rb +11 -0
- data/lib/lbp/collection.rb +131 -0
- data/lib/lbp/functions.rb +12 -0
- data/lib/lbp/item.rb +153 -0
- data/lib/lbp/item_group.rb +52 -0
- data/lib/lbp/paragraph.rb +87 -0
- data/lib/lbp/transcription.rb +249 -0
- data/lib/lbp/version.rb +3 -0
- data/spec/collection_spec.rb +60 -0
- data/spec/config_globals.rb +18 -0
- data/spec/item_group_spec.rb +39 -0
- data/spec/item_spec.rb +74 -0
- data/spec/paragraph_spec.rb +37 -0
- data/spec/spec_helper.rb +89 -0
- data/spec/transcription_spec.rb +120 -0
- metadata +218 -0
@@ -0,0 +1,52 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'rugged'
|
3
|
+
require 'lbp/functions'
|
4
|
+
require 'lbp/transcription'
|
5
|
+
|
6
|
+
module Lbp
  # A group of items inside a project file, identified by the id of the
  # <div> element that wraps it.
  #
  # Every query re-reads and re-parses the project file, so results always
  # reflect the file's current contents on disk.
  class ItemGroup
    attr_reader :igid

    # projectfile - path of the XML project file (read with File.read).
    # igid        - value of the @id attribute of the group's <div>.
    def initialize(projectfile, igid)
      @igid = igid
      @projectfile = projectfile
    end

    # Returns an Array with one Item per `fileName/@filestem` attribute
    # found on items anywhere below this group's <div>.
    def items
      select_nodes("//div[@id='#{@igid}']//item/fileName/@filestem").map do |filestem|
        Item.new(@projectfile, filestem.value)
      end
    end

    # Builds the Item for the given filestem. The filestem is not checked
    # against the contents of this group.
    def item(fs)
      Item.new(@projectfile, fs)
    end

    # Returns the text of the <head> child of this group's <div>
    # (an empty string when there is none).
    def title
      select_nodes("//div[@id='#{@igid}']/head").text
    end

    # True when this group's <div> contains at least one nested <div>.
    def has_sub_group?
      !select_nodes("//div[@id='#{@igid}']//div").empty?
    end

    # True unless this group's <div> carries class="toplevel".
    #
    # NOTE(review): an igid that matches no <div> at all also yields true,
    # because the check only looks for a matching top-level <div>. The
    # original author flagged this heuristic as fragile.
    def has_parent_group?
      select_nodes("//div[@id='#{@igid}'][@class='toplevel']").empty?
    end

    private

    # Parses the project file from disk and evaluates +xpath+ against it.
    def select_nodes(xpath)
      Nokogiri::XML(File.read(@projectfile)).xpath(xpath)
    end
  end
end
|
@@ -0,0 +1,87 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'rugged'
|
3
|
+
require 'lbp/functions'
|
4
|
+
|
5
|
+
module Lbp
  # A single paragraph of a transcription, addressed by its xml:id.
  #
  # Every query builds a fresh Transcription from the project file and file
  # hash given at construction time; nothing is cached between calls.
  class Paragraph
    # Namespace binding for the `tei:` prefix used in the XPath queries.
    TEI_NAMESPACE = { 'tei' => 'http://www.tei-c.org/ns/1.0' }.freeze

    attr_reader :pid

    # projectfile - project file handed on to Collection and Transcription.
    # filehash    - file description hash handed on to Transcription.
    # pid         - xml:id of the paragraph.
    def initialize(projectfile, filehash, pid)
      @projectfile = projectfile
      @filehash = filehash
      @pid = pid

      # NOTE(review): despite its name this holds the Collection object
      # itself, and no method of this class reads it. Kept so construction
      # behaves exactly as before.
      @confighash = Collection.new(@projectfile)
    end

    # Returns the position of this paragraph among the body paragraphs:
    # the total number of body paragraphs minus the number of body
    # paragraphs that come after this one.
    def number
      transcr = transcription
      total = transcr.number_of_body_paragraphs
      following = transcr.nokogiri.xpath(
        "//tei:body//tei:p[preceding::tei:p[@xml:id='#{@pid}']]", TEI_NAMESPACE
      ).count

      total - following
    end

    # Returns the Paragraph that follows this one, or nil when there is none.
    def next
      nextpid = transcription.nokogiri.xpath(
        "//tei:p[@xml:id='#{@pid}']/following::tei:p[1]/@xml:id", TEI_NAMESPACE
      )
      # An empty node set has the text "", never nil, so emptiness has to be
      # tested on the set itself (same check as #previous).
      return nil if nextpid.empty?

      Paragraph.new(@projectfile, @filehash, nextpid.text)
    end

    # Returns the Paragraph that precedes this one, or nil when there is none.
    def previous
      previouspid = transcription.nokogiri.xpath(
        "//tei:p[@xml:id='#{@pid}']/preceding::tei:p[1]/@xml:id", TEI_NAMESPACE
      )
      return nil if previouspid.empty?

      Paragraph.new(@projectfile, @filehash, previouspid.text)
    end

    # Transforms the whole transcription with +xsltfile+ and returns the
    # <p> node(s) of the result whose @id equals this paragraph's pid.
    def transform(xsltfile, xslt_param_array=[])
      result = transcription.transform(xsltfile, xslt_param_array)
      result.xpath("//p[@id='#{@pid}']")
    end

    # Returns this paragraph's <p> node(s) from the "clean" transformation.
    #
    # The clean transform is used rather than the plain-text one because the
    # paragraph elements must still be present in order to select the
    # requested paragraph by id.
    def transform_plain_text(xslt_param_array=[])
      result = transcription.transform_clean(xslt_param_array)
      result.xpath("//p[@id='#{@pid}']")
    end

    # Number of whitespace-separated words in the paragraph text.
    def word_count
      transform_plain_text.text.split.size
    end

    # Array of the paragraph's words, downcased, in document order.
    def word_array
      transform_plain_text.text.split.map { |word| word.downcase }
    end

    # Returns a Hash of word => number of occurrences.
    #
    # sort  - "frequency" or "word": the value to order the entries by.
    # order - "ascending" or "descending".
    #
    # Any other sort/order value leaves the entries in first-occurrence order.
    def word_frequency(sort='frequency', order='descending')
      counts = Hash.new(0)
      word_array.each { |word| counts[word] += 1 }
      return counts unless %w[ascending descending].include?(order)

      ranked =
        case sort
        when "frequency" then counts.sort_by { |_word, count| count }
        when "word" then counts.sort_by { |word, _count| word }
        else return counts
        end

      ranked.reverse! if order == "descending"
      ranked.to_h
    end

    private

    # Builds the Transcription this paragraph belongs to.
    def transcription
      Transcription.new(@projectfile, @filehash)
    end
  end
end
|
@@ -0,0 +1,249 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'rugged'
|
3
|
+
require 'lbp/functions'
|
4
|
+
require 'lbp/item'
|
5
|
+
require 'open-uri'
|
6
|
+
|
7
|
+
module Lbp
  # One transcription (edition) of an item: exposes its TEI header metadata,
  # XSLT transformations, word statistics and paragraphs.
  #
  # For files whose source is 'local', operations that read the file first
  # check out the requested edition branch (when it is not the current one)
  # and restore the previous branch afterwards.
  class Transcription
    # Namespace binding for the `tei:` prefix used in the XPath queries.
    TEI_NAMESPACE = { 'tei' => 'http://www.tei-c.org/ns/1.0' }.freeze

    attr_reader :fs, :type, :ed, :xslt_dir

    # projectfile - project file handed on to Collection and Item.
    # filehash    - Hash describing the file; the keys read here are :fs,
    #               :type ('critical' or 'documentary'), :ed, :source and
    #               :path.
    def initialize(projectfile, filehash)
      @filehash = filehash
      @projectfile = projectfile

      @fs = filehash[:fs]
      @type = filehash[:type] # critical or documentary
      @ed = filehash[:ed]

      @confighash = Collection.new(@projectfile).confighash
      @xslthash = @confighash[:xslt_dirs]

      # The xslt version should eventually be gathered from the document;
      # for now it is nil because no documents currently declare it, so the
      # "default" schema is always selected.
      xslt_version = nil
      @schema = @xslthash[xslt_version || "default"]

      @xslt_dir =
        case @type
        when 'critical' then @schema[:critical]
        when 'documentary' then @schema[:documentary]
        end

      if @filehash[:source] == 'local'
        item = Item.new(@projectfile, @fs)
        @current_branch = item.git_current_branch
        # Only keep the item around when a branch switch will be needed.
        @item = item if @current_branch != @ed
      end
    end

    ## Begin file path methods

    # Returns the path of the requested file as given in the file hash.
    def file_path
      @filehash[:path]
    end

    # Opens the file and returns the open handle. The caller is responsible
    # for closing it.
    def file
      open(file_path)
    end

    # Parses the file and returns the Nokogiri XML document.
    def nokogiri
      handle = file
      begin
        Nokogiri::XML(handle)
      ensure
        # The document is fully read during parsing, so the handle can be
        # released right away instead of waiting for garbage collection.
        handle.close if handle.respond_to?(:close)
      end
    end
    ## End File Path Methods

    ### Item Header Extraction and Metadata Methods

    # Text of the first title in the header's title statement.
    def title
      tei_xpath("/tei:TEI/tei:teiHeader[1]/tei:fileDesc[1]/tei:titleStmt[1]/tei:title[1]").text
    end

    # Text of the author element(s) in the header's title statement.
    def author
      tei_xpath("/tei:TEI/tei:teiHeader[1]/tei:fileDesc/tei:titleStmt[1]/tei:author").text
    end

    # Text of the editor element(s) in the header's title statement.
    def editor
      tei_xpath("/tei:TEI/tei:teiHeader[1]/tei:fileDesc/tei:titleStmt[1]/tei:editor").text
    end

    # Value of edition/@n, or nil when the attribute is absent.
    def ed_no
      tei_attribute("/tei:TEI/tei:teiHeader[1]/tei:fileDesc[1]/tei:editionStmt[1]/tei:edition[1]/@n")
    end

    # Value of edition/date/@when, or nil when the attribute is absent.
    def ed_date
      tei_attribute("/tei:TEI/tei:teiHeader[1]/tei:fileDesc[1]/tei:editionStmt[1]/tei:edition[1]/tei:date[1]/@when")
    end

    # Value of publicationStmt/date/@when, or nil when the attribute is absent.
    def pub_date
      tei_attribute("/tei:TEI/tei:teiHeader[1]/tei:fileDesc[1]/tei:publicationStmt[1]/tei:date[1]/@when")
    end

    # Value of variantEncoding/@method, or nil when the attribute is absent.
    def encoding_method
      tei_attribute("/tei:TEI/tei:teiHeader[1]/tei:encodingDesc[1]/tei:variantEncoding[1]/@method")
    end

    # Value of variantEncoding/@location, or nil when the attribute is absent.
    def encoding_location
      tei_attribute("/tei:TEI/tei:teiHeader[1]/tei:encodingDesc[1]/tei:variantEncoding[1]/@location")
    end

    # Returns nil for critical transcriptions; otherwise 1 when the document
    # contains any tei:pb element, 2 when it contains any tei:cb element,
    # and nil when it contains neither.
    #
    # NOTE(review): tei:pb is tested first, so a document containing both
    # pb and cb elements reports 1 - confirm this is intended.
    def number_of_columns
      return nil if @type == "critical"

      xmldoc = nokogiri
      if xmldoc.xpath("//tei:pb", TEI_NAMESPACE).count != 0
        1
      elsif xmldoc.xpath("//tei:cb", TEI_NAMESPACE).count != 0
        2
      end
    end

=begin - I think these methods belong with the Item or ItemRepo Object

    ### End Header and Metadata Information Extraction Methods ###
    ### Begin GIT functions ###
    def is_git_dir
      gitpath = @file_dir + ".git"

      if File.directory?(gitpath)
        true
      else
        false
      end
    end
    def git_branches
      repo = Rugged::Repository.new(@file_dir)
      branches = repo.branches.map { |branch| branch.name }
      return branches
    end
    def git_current_branch
      repo = Rugged::Repository.new(@file_dir)
      current_branch = repo.head.name.gsub(%r!\Arefs/heads/(.*)\z!) { $1 }
      return current_branch
    end
    def git_tags
      repo = Rugged::Repository.new(@file_dir)
      tags = repo.tags.map { |tag| tag.name }
      return tags
    end
    #need test for this
    def git_checkout(branch)
      repo = Rugged::Repository.new(@file_dir)
      repo.checkout(branch)
    end
    ### End Git Methods ###
=end

    ### Begin transform (XSLT) methods ###

    # Transforms the file with the stylesheet +xsltfile+, passing
    # +xslt_param_array+ on to xslt_transform, and returns its result.
    def transform(xsltfile, xslt_param_array=[])
      xmlfile = file_path
      with_edition_checked_out do
        xslt_transform(xmlfile, xsltfile, xslt_param_array)
      end
    end

    # Transforms with the schema's :main_view stylesheet.
    def transform_main_view(xslt_param_array=[])
      transform(@xslt_dir + @schema[:main_view], xslt_param_array) # "text_display.xsl"
    end

    # Transforms with the schema's :index_view stylesheet.
    def transform_index_view(xslt_param_array=[])
      transform(@xslt_dir + @schema[:index_view], xslt_param_array) # "text_display_index.xsl"
    end

    # Transforms with the schema's :clean_view stylesheet.
    def transform_clean(xslt_param_array=[])
      transform(@xslt_dir + @schema[:clean_view], xslt_param_array) # "clean_forStatistics.xsl"
    end

    # Transforms with the schema's :plain_text stylesheet.
    def transform_plain_text(xslt_param_array=[])
      transform(@xslt_dir + @schema[:plain_text], xslt_param_array) # "plaintext.xsl"
    end

    # Transforms with the schema's :toc stylesheet.
    def transform_toc(xslt_param_array=[])
      transform(@xslt_dir + @schema[:toc], xslt_param_array) # "lectio_outline.xsl"
    end
    ### End of Transformation Methods ###

    ### Begin Statistics Methods ###

    # Number of whitespace-separated words in the plain-text transformation.
    def word_count
      transform_plain_text.text.split.size
    end

    # Array of the words of the plain-text transformation, downcased.
    def word_array
      transform_plain_text.text.split.map { |word| word.downcase }
    end

    # Returns a Hash of word => number of occurrences.
    #
    # sort  - "frequency" or "word": the value to order the entries by.
    # order - "ascending" or "descending".
    #
    # Any other sort/order value leaves the entries in first-occurrence order.
    def word_frequency(sort='frequency', order='descending')
      counts = Hash.new(0)
      word_array.each { |word| counts[word] += 1 }
      return counts unless %w[ascending descending].include?(order)

      ranked =
        case sort
        when "frequency" then counts.sort_by { |_word, count| count }
        when "word" then counts.sort_by { |word, _count| word }
        else return counts
        end

      ranked.reverse! if order == "descending"
      ranked.to_h
    end

    # Number of tei:p elements inside tei:body.
    def number_of_body_paragraphs
      with_edition_checked_out do
        nokogiri.xpath("//tei:body//tei:p", TEI_NAMESPACE).count
      end
    end

    # Returns an Array with one Paragraph per xml:id attribute found on
    # tei:p elements inside tei:body.
    def paragraphs
      ids = with_edition_checked_out do
        nokogiri.xpath("//tei:body//tei:p/@xml:id", TEI_NAMESPACE)
      end

      ids.map { |id| Paragraph.new(@projectfile, @filehash, id.value) }
    end

    # Builds the Paragraph with the given xml:id. The id is not checked
    # against the document.
    def paragraph(pid)
      Paragraph.new(@projectfile, @filehash, pid)
    end

    private

    # Evaluates +path+ against the parsed file with the tei: prefix bound.
    def tei_xpath(path)
      nokogiri.xpath(path, TEI_NAMESPACE)
    end

    # Returns the value of the first attribute node matching +path+, or nil
    # when nothing matches.
    def tei_attribute(path)
      node = nokogiri.at_xpath(path, TEI_NAMESPACE)
      node && node.value
    end

    # True when the file is local and the requested edition is not the
    # branch that was checked out at construction time.
    def edition_switch_needed?
      @current_branch != @ed && @filehash[:source] == 'local'
    end

    # Runs the block with the edition branch checked out and returns the
    # block's value. The previous branch is restored afterwards, even when
    # the block raises. Without a needed switch the block simply runs.
    def with_edition_checked_out
      return yield unless edition_switch_needed?

      @item.git_checkout(@ed)
      begin
        yield
      ensure
        @item.git_checkout(@current_branch)
      end
    end
  end
end
|
data/lib/lbp/version.rb
ADDED
@@ -0,0 +1,60 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'lbp'
|
3
|
+
require 'pry'
|
4
|
+
require 'nokogiri'
|
5
|
+
|
6
|
+
# Smoke tests for Lbp::Collection: each example only checks the kind of
# object returned by one accessor, using the project file named by the
# $pg_projectfile global from config_globals.
describe 'collection object' do
  require_relative "config_globals"

  # Shared collection under test, built once when the group is defined.
  $collection_obj = Lbp::Collection.new($pg_projectfile)

  it 'should get list of item filestems in sequenced array' do
    expect($collection_obj.item_filestems).to be_kind_of(Array)
  end

  it 'should get a list of item names in sequenced array' do
    expect($collection_obj.item_titles).to be_kind_of(Array)
  end

  it 'should return a hash of filestems and item names' do
    expect($collection_obj.items_fs_title_hash).to be_kind_of(Hash)
  end

  it 'should get list of item objects in an array' do
    # FIXME: calling `title` on the first returned item currently raises an
    # error, so only the container type is checked here.
    expect($collection_obj.items).to be_kind_of(Array)
  end

  it 'should return local texts dir' do
    expect($collection_obj.local_texts_dir).to be_kind_of(String)
  end

  it 'should return general repo directory' do
    expect($collection_obj.git_repo).to be_kind_of(String)
  end

  it 'should return citation lists directory' do
    expect($collection_obj.citation_lists_dir).to be_kind_of(String)
  end

  it 'should return xslt hash' do
    expect($collection_obj.xslt_dirs).to be_kind_of(Hash)
  end

  it 'should return a specific item object when a specific item group id is given' do
    expect($collection_obj.item('lectio1')).to be_kind_of(Lbp::Item)
  end

  it 'should return the title of a given collection specified in the project data file' do
    expect($collection_obj.title).to be_kind_of(String)
  end
end
|