lbp 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,52 @@
1
+ require 'nokogiri'
2
+ require 'rugged'
3
+ require 'lbp/functions'
4
+ require 'lbp/transcription'
5
+
6
module Lbp
  # A group of items inside a project file, addressed by the +id+ attribute
  # of a <div> element (+igid+).
  #
  # The project file is read and parsed again on every query, exactly as
  # before; nothing is cached between calls.
  class ItemGroup
    attr_reader :igid

    # @param projectfile [String] path to the project XML file
    # @param igid [String] value of the group's <div id="..."> attribute
    def initialize(projectfile, igid)
      @igid = igid
      @projectfile = projectfile
    end

    # Builds one Item per +filestem+ attribute found on the
    # item/fileName elements below this group's <div>.
    #
    # @return [Array<Item>]
    def items
      filestems = project_document.xpath("//div[@id='#{@igid}']//item/fileName/@filestem")
      filestems.map { |filestem| Item.new(@projectfile, filestem.value) }
    end

    # @param fs [String] filestem handed straight to Item.new
    # @return [Item]
    def item(fs)
      Item.new(@projectfile, fs)
    end

    # @return [String] text of the <head> child of this group's <div>
    def title
      project_document.xpath("//div[@id='#{@igid}']/head").text
    end

    # @return [Boolean] true when this group's <div> has a descendant <div>
    def has_sub_group?
      !project_document.xpath("//div[@id='#{@igid}']//div").empty?
    end

    # True unless this group's <div> carries class="toplevel".
    # NOTE(review): this also answers true when no <div> with this id exists
    # at all; the original author flagged the heuristic as fragile.
    #
    # @return [Boolean]
    def has_parent_group?
      project_document.xpath("//div[@id='#{@igid}'][@class='toplevel']").empty?
    end

    private

    # Reads and parses the project file; the single place where it is loaded.
    def project_document
      Nokogiri::XML(File.read(@projectfile))
    end
  end
end
@@ -0,0 +1,87 @@
1
+ require 'nokogiri'
2
+ require 'rugged'
3
+ require 'lbp/functions'
4
+
5
module Lbp
  # One paragraph (a tei:p element identified by its xml:id) of a
  # transcription file.
  class Paragraph
    # Namespace binding used by every TEI XPath query in this class.
    TEI_NAMESPACE = { 'tei' => 'http://www.tei-c.org/ns/1.0' }.freeze

    attr_reader :pid

    # @param projectfile [String] passed on to Collection and Transcription
    # @param filehash [Hash] passed on to Transcription
    # @param pid [String] xml:id of the paragraph
    def initialize(projectfile, filehash, pid)
      @projectfile = projectfile
      @filehash = filehash
      @pid = pid

      # NOTE(review): despite its name this holds the Collection object itself,
      # not its confighash; nothing in this class reads it.
      @confighash = Collection.new(@projectfile)
    end

    # Position of this paragraph among the body paragraphs, computed as the
    # total number of body paragraphs minus those that come after this one.
    #
    # @return [Integer]
    def number
      transcr = transcription
      total = transcr.number_of_body_paragraphs
      following = transcr.nokogiri.xpath(
        "//tei:body//tei:p[preceding::tei:p[@xml:id='#{@pid}']]", TEI_NAMESPACE
      ).count
      total - following
    end

    # @return [Paragraph, nil] the first tei:p after this one, or nil when
    #   there is none
    def next
      # The old check compared NodeSet#text with nil, which never matched, so
      # the last paragraph produced a Paragraph with an empty pid.
      sibling_paragraph('following')
    end

    # @return [Paragraph, nil] the nearest tei:p before this one, or nil when
    #   there is none
    def previous
      sibling_paragraph('preceding')
    end

    # Transforms the whole transcription and selects this paragraph's
    # <p id="..."> from the result.
    #
    # @param xsltfile [String] stylesheet handed to Transcription#transform
    # @param xslt_param_array [Array] parameters handed to the transform
    # @return the result of +xpath+ on the transformed document
    def transform(xsltfile, xslt_param_array=[])
      transformed = transcription.transform(xsltfile, xslt_param_array)
      transformed.xpath("//p[@id='#{@pid}']")
    end

    # Uses the "clean" transform rather than the plain-text one because the
    # <p> elements are still needed to select the desired paragraph.
    #
    # @param xslt_param_array [Array] parameters handed to the transform
    # @return the result of +xpath+ on the transformed document
    def transform_plain_text(xslt_param_array=[])
      cleaned = transcription.transform_clean(xslt_param_array)
      cleaned.xpath("//p[@id='#{@pid}']")
    end

    # @return [Integer] number of whitespace-separated words
    def word_count
      transform_plain_text.text.split.size
    end

    # @return [Array<String>] downcased whitespace-separated words
    def word_array
      transform_plain_text.text.split.map(&:downcase)
    end

    # Counts how often each word occurs.
    #
    # @param sort [String] "frequency" or "word"
    # @param order [String] "descending" or "ascending"
    # @return [Hash{String=>Integer}] sorted as requested; left in first-seen
    #   order when +sort+ or +order+ is not one of the recognised values
    def word_frequency(sort='frequency', order='descending')
      counts = Hash.new(0)
      word_array.each { |word| counts[word] += 1 }
      return counts unless %w[ascending descending].include?(order)

      sorted =
        case sort
        when 'frequency' then counts.sort_by { |_word, count| count }
        when 'word' then counts.sort_by { |word, _count| word }
        else return counts
        end
      sorted.reverse! if order == 'descending'
      sorted.to_h
    end

    private

    # A fresh Transcription for the file this paragraph belongs to.
    def transcription
      Transcription.new(@projectfile, @filehash)
    end

    # Looks up the nearest tei:p on the given XPath axis ("following" or
    # "preceding") and wraps its xml:id in a Paragraph; nil when none exists.
    def sibling_paragraph(axis)
      ids = transcription.nokogiri.xpath(
        "//tei:p[@xml:id='#{@pid}']/#{axis}::tei:p[1]/@xml:id", TEI_NAMESPACE
      )
      return nil if ids.empty?

      Paragraph.new(@projectfile, @filehash, ids.text)
    end
  end
end
@@ -0,0 +1,249 @@
1
+ require 'nokogiri'
2
+ require 'rugged'
3
+ require 'lbp/functions'
4
+ require 'lbp/item'
5
+ require 'open-uri'
6
+
7
module Lbp
  # A single transcription file of an item: header metadata, XSLT transforms
  # and simple text statistics.
  #
  # +filehash+ is read for the keys :fs, :type, :ed, :source and :path.
  # When :source is 'local' and the item's current git branch differs from
  # :ed, operations that read the file first check out :ed and afterwards
  # restore the branch that was current when the object was created.
  class Transcription
    # Namespace binding used by every TEI XPath query in this class.
    TEI_NAMESPACE = { 'tei' => 'http://www.tei-c.org/ns/1.0' }.freeze

    attr_reader :fs, :type, :ed, :xslt_dir

    # @param projectfile [String] passed on to Collection, Item and Paragraph
    # @param filehash [Hash] see the class comment for the keys that are read
    def initialize(projectfile, filehash)
      @filehash = filehash
      @projectfile = projectfile

      @fs = filehash[:fs]
      @type = filehash[:type] # critical or documentary
      @ed = filehash[:ed]

      @confighash = Collection.new(@projectfile).confighash
      @xslthash = @confighash[:xslt_dirs]

      # The xslt version should eventually be gathered from the document; no
      # document currently declares it, so the "default" schema is always used.
      xslt_version = nil
      @schema = @xslthash[xslt_version || "default"]

      @xslt_dir =
        case @type
        when 'critical' then @schema[:critical]
        when 'documentary' then @schema[:documentary]
        end

      return unless @filehash[:source] == 'local'

      item = Item.new(@projectfile, @fs)
      @current_branch = item.git_current_branch
      # Only keep the item around when a checkout will actually be needed.
      @item = item if @current_branch != @ed
    end

    ## Begin file path methods

    # @return the :path entry of the filehash
    def file_path
      @filehash[:path]
    end

    # Opens the transcription. The caller owns the returned handle and is
    # responsible for closing it.
    #
    # open-uri stopped patching Kernel#open for URLs in Ruby 3.0, so URI.open
    # is used where it exists; it falls back to a plain open for local paths.
    #
    # @return an IO-like object for the path or URL in #file_path
    def file
      path = file_path
      URI.respond_to?(:open) ? URI.open(path) : open(path)
    end

    # Parses the transcription and closes the handle it opened.
    #
    # @return [Nokogiri::XML::Document]
    def nokogiri
      io = file
      begin
        Nokogiri::XML(io)
      ensure
        io.close if io.respond_to?(:close)
      end
    end
    ## End File Path Methods

    ### Item Header Extraction and Metadata Methods

    # @return [String] text of the first titleStmt/title in the TEI header
    def title
      header_text("/tei:TEI/tei:teiHeader[1]/tei:fileDesc[1]/tei:titleStmt[1]/tei:title[1]")
    end

    # @return [String] text of titleStmt/author in the TEI header
    def author
      header_text("/tei:TEI/tei:teiHeader[1]/tei:fileDesc/tei:titleStmt[1]/tei:author")
    end

    # @return [String] text of titleStmt/editor in the TEI header
    def editor
      header_text("/tei:TEI/tei:teiHeader[1]/tei:fileDesc/tei:titleStmt[1]/tei:editor")
    end

    # @return [String, nil] the edition's @n attribute, nil when absent
    def ed_no
      header_attribute("/tei:TEI/tei:teiHeader[1]/tei:fileDesc[1]/tei:editionStmt[1]/tei:edition[1]/@n")
    end

    # @return [String, nil] the edition date's @when attribute, nil when absent
    def ed_date
      header_attribute("/tei:TEI/tei:teiHeader[1]/tei:fileDesc[1]/tei:editionStmt[1]/tei:edition[1]/tei:date[1]/@when")
    end

    # @return [String, nil] the publication date's @when attribute, nil when absent
    def pub_date
      header_attribute("/tei:TEI/tei:teiHeader[1]/tei:fileDesc[1]/tei:publicationStmt[1]/tei:date[1]/@when")
    end

    # @return [String, nil] variantEncoding/@method, nil when absent
    def encoding_method
      header_attribute("/tei:TEI/tei:teiHeader[1]/tei:encodingDesc[1]/tei:variantEncoding[1]/@method")
    end

    # @return [String, nil] variantEncoding/@location, nil when absent
    def encoding_location
      header_attribute("/tei:TEI/tei:teiHeader[1]/tei:encodingDesc[1]/tei:variantEncoding[1]/@location")
    end

    # @return [Integer, nil] nil for critical transcriptions; otherwise 1 when
    #   the file contains any tei:pb, 2 when it contains any tei:cb, else nil
    def number_of_columns
      return nil if @type == "critical"

      xmldoc = nokogiri
      return 1 unless xmldoc.xpath("//tei:pb", TEI_NAMESPACE).empty?
      return 2 unless xmldoc.xpath("//tei:cb", TEI_NAMESPACE).empty?

      nil
    end

    # Disabled code kept from the original file; the author's note reads:
    # "I think these methods belong with the Item or ItemRepo Object"
    #
    # ### End Header and Metadata Information Extraction Methods ###
    # ### Begin GIT functions ###
    # def is_git_dir
    #   gitpath = @file_dir + ".git"
    #
    #   if File.directory?(gitpath)
    #     true
    #   else
    #     false
    #   end
    # end
    # def git_branches
    #   repo = Rugged::Repository.new(@file_dir)
    #   branches = repo.branches.map { |branch| branch.name }
    #   return branches
    # end
    # def git_current_branch
    #   repo = Rugged::Repository.new(@file_dir)
    #   current_branch = repo.head.name.gsub(%r!\Arefs/heads/(.*)\z!) { $1 }
    #   return current_branch
    # end
    # def git_tags
    #   repo = Rugged::Repository.new(@file_dir)
    #   tags = repo.tags.map { |tag| tag.name }
    #   return tags
    # end
    # #need test for this
    # def git_checkout(branch)
    #   repo = Rugged::Repository.new(@file_dir)
    #   repo.checkout(branch)
    # end
    # ### End Git Methods ###

    ### Begin transform (XSLT) methods ###

    # Runs +xslt_transform+ on the transcription, with the requested edition
    # checked out when necessary.
    #
    # Previously the checkout path returned the result of the final
    # git_checkout call instead of the transformed document.
    #
    # @param xsltfile [String] stylesheet handed to xslt_transform
    # @param xslt_param_array [Array] parameters handed to xslt_transform
    # @return whatever xslt_transform returns
    def transform(xsltfile, xslt_param_array=[])
      xmlfile = file_path
      with_edition_checked_out { xslt_transform(xmlfile, xsltfile, xslt_param_array) }
    end

    # Each helper below picks its stylesheet name from the active schema and
    # appends it to #xslt_dir. They used to pass `xslt_param_array=[]` on to
    # #transform, which discarded the caller's parameters.

    # @param xslt_param_array [Array] parameters for the :main_view stylesheet
    def transform_main_view(xslt_param_array=[])
      transform_with(:main_view, xslt_param_array) # "text_display.xsl"
    end

    # @param xslt_param_array [Array] parameters for the :index_view stylesheet
    def transform_index_view(xslt_param_array=[])
      transform_with(:index_view, xslt_param_array) # "text_display_index.xsl"
    end

    # @param xslt_param_array [Array] parameters for the :clean_view stylesheet
    def transform_clean(xslt_param_array=[])
      transform_with(:clean_view, xslt_param_array) # "clean_forStatistics.xsl"
    end

    # @param xslt_param_array [Array] parameters for the :plain_text stylesheet
    def transform_plain_text(xslt_param_array=[])
      transform_with(:plain_text, xslt_param_array) # "plaintext.xsl"
    end

    # @param xslt_param_array [Array] parameters for the :toc stylesheet
    def transform_toc(xslt_param_array=[])
      transform_with(:toc, xslt_param_array) # "lectio_outline.xsl"
    end
    ### End of Transformation Methods ###

    ### Begin Statistics Methods ###

    # @return [Integer] number of whitespace-separated words in the plain text
    def word_count
      transform_plain_text.text.split.size
    end

    # @return [Array<String>] downcased whitespace-separated words
    def word_array
      transform_plain_text.text.split.map(&:downcase)
    end

    # Counts how often each word occurs. The defaults match
    # Paragraph#word_frequency.
    #
    # @param sort [String] "frequency" or "word"
    # @param order [String] "descending" or "ascending"
    # @return [Hash{String=>Integer}] sorted as requested; left in first-seen
    #   order when +sort+ or +order+ is not one of the recognised values
    def word_frequency(sort='frequency', order='descending')
      counts = Hash.new(0)
      word_array.each { |word| counts[word] += 1 }
      return counts unless %w[ascending descending].include?(order)

      sorted =
        case sort
        when 'frequency' then counts.sort_by { |_word, count| count }
        when 'word' then counts.sort_by { |word, _count| word }
        else return counts
        end
      sorted.reverse! if order == 'descending'
      sorted.to_h
    end

    # @return [Integer] number of tei:p elements below tei:body
    def number_of_body_paragraphs
      with_edition_checked_out { nokogiri.xpath("//tei:body//tei:p", TEI_NAMESPACE) }.count
    end

    # @return [Array<Paragraph>] one Paragraph per xml:id found on a tei:p
    #   below tei:body
    def paragraphs
      ids = with_edition_checked_out { nokogiri.xpath("//tei:body//tei:p/@xml:id", TEI_NAMESPACE) }
      ids.map { |id| Paragraph.new(@projectfile, @filehash, id.value) }
    end

    # @param pid [String] xml:id of the wanted paragraph
    # @return [Paragraph]
    def paragraph(pid)
      Paragraph.new(@projectfile, @filehash, pid)
    end

    private

    # True when the file is local and the edition is not the current branch.
    def checkout_needed?
      @current_branch != @ed && @filehash[:source] == 'local'
    end

    # Yields with the requested edition checked out and returns the block's
    # value. The previously current branch is restored even when the block
    # raises, so a failed transform no longer leaves the repository on +@ed+.
    def with_edition_checked_out
      return yield unless checkout_needed?

      @item.git_checkout(@ed)
      begin
        yield
      ensure
        @item.git_checkout(@current_branch)
      end
    end

    # Runs #transform with the stylesheet the schema lists under +view+.
    def transform_with(view, xslt_param_array)
      transform(@xslt_dir + @schema[view], xslt_param_array)
    end

    # Text content of every node matched by +path+ ("" when nothing matches).
    def header_text(path)
      nokogiri.xpath(path, TEI_NAMESPACE).text
    end

    # Value of the first attribute matched by +path+, or nil when there is no
    # match (this used to raise NoMethodError on nil).
    def header_attribute(path)
      node = nokogiri.at_xpath(path, TEI_NAMESPACE)
      node && node.value
    end
  end
end
@@ -0,0 +1,3 @@
1
module Lbp
  # Version string of the lbp gem. Frozen so the constant cannot be mutated
  # in place.
  VERSION = "0.0.1".freeze
end
@@ -0,0 +1,60 @@
1
+ require 'spec_helper'
2
+ require 'lbp'
3
+ require 'pry'
4
+ require 'nokogiri'
5
+
6
describe 'collection object' do
  require_relative "config_globals"

  # Shared Collection under test, built from the project file named by
  # $pg_projectfile (not defined in this file).
  $collection_obj = Lbp::Collection.new($pg_projectfile)

  it 'should get list of item filestems in sequenced array' do
    expect($collection_obj.item_filestems).to be_kind_of(Array)
  end

  it 'should get a list of item names in sequenced array' do
    expect($collection_obj.item_titles).to be_kind_of(Array)
  end

  it 'should return a hash of filestems and item names' do
    expect($collection_obj.items_fs_title_hash).to be_kind_of(Hash)
  end

  it 'should get list of item objects in an array' do
    item_objects = $collection_obj.items
    # running item_objects.first.title returns ERROR!!!
    expect(item_objects).to be_kind_of(Array)
  end

  it 'should return local texts dir' do
    expect($collection_obj.local_texts_dir).to be_kind_of(String)
  end

  it 'should return general repo directory' do
    expect($collection_obj.git_repo).to be_kind_of(String)
  end

  it 'should return citation lists directory' do
    expect($collection_obj.citation_lists_dir).to be_kind_of(String)
  end

  it 'should return xslt hash' do
    expect($collection_obj.xslt_dirs).to be_kind_of(Hash)
  end

  it 'should return a specific item object when a specific item group id is given' do
    expect($collection_obj.item('lectio1')).to be_kind_of(Lbp::Item)
  end

  it 'should return the title of a given collection specified in the project data file' do
    expect($collection_obj.title).to be_kind_of(String)
  end
end