lbp 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,52 @@
1
+ require 'nokogiri'
2
+ require 'rugged'
3
+ require 'lbp/functions'
4
+ require 'lbp/transcription'
5
+
6
module Lbp
  # A group of items declared in the project file as a <div> whose @id is the
  # item group id. Every query re-reads and re-parses the project file, so the
  # results always reflect the file's current content.
  class ItemGroup
    attr_reader :igid

    # projectfile - path to the project XML file.
    # igid        - id of the <div> that represents this group.
    def initialize(projectfile, igid)
      @igid = igid
      @projectfile = projectfile
    end

    # Returns an Array of Item objects, one per fileName/@filestem attribute
    # found anywhere below this group's <div>.
    def items
      group_xpath("//item/fileName/@filestem").map do |filestem|
        Item.new(@projectfile, filestem.value)
      end
    end

    # Returns an Item for the given filestem. No check is made that the
    # filestem actually belongs to this group.
    def item(fs)
      Item.new(@projectfile, fs)
    end

    # Returns the text of the group's direct <head> child ("" when absent).
    def title
      group_xpath("/head").text
    end

    # True when the group's <div> contains at least one nested <div>.
    def has_sub_group?
      !group_xpath("//div").empty?
    end

    # True unless the group's <div> carries class="toplevel".
    #
    # NOTE: this is a heuristic. It also answers true when no <div> with this
    # id exists at all, because it only tests for the absence of a matching
    # top-level <div>.
    def has_parent_group?
      group_xpath("[@class='toplevel']").empty?
    end

    private

    # Parses the project file and evaluates an XPath expression that starts at
    # this group's <div>; +suffix+ is appended verbatim to the <div> selector.
    def group_xpath(suffix)
      document = Nokogiri::XML(File.read(@projectfile))
      document.xpath("//div[@id='#{@igid}']#{suffix}")
    end
  end
end
@@ -0,0 +1,87 @@
1
+ require 'nokogiri'
2
+ require 'rugged'
3
+ require 'lbp/functions'
4
+
5
module Lbp
  # A single paragraph (a TEI <p> identified by its xml:id) inside one
  # transcription. Every query builds a fresh Transcription from the project
  # file and file hash handed to the constructor.
  class Paragraph
    TEI_NAMESPACE = { 'tei' => 'http://www.tei-c.org/ns/1.0' }.freeze

    attr_reader :pid

    # projectfile - path to the project file.
    # filehash    - hash describing the transcription file (passed through to
    #               Transcription.new).
    # pid         - xml:id of the paragraph.
    def initialize(projectfile, filehash, pid)
      @projectfile = projectfile
      @filehash = filehash
      @pid = pid

      # NOTE(review): this stores the Collection object itself, whereas
      # Transcription stores Collection#confighash under the same name. The
      # value is not read anywhere in this class - confirm which is intended.
      @confighash = Collection.new(@projectfile)
    end

    # Returns the 1-based position of this paragraph among the body
    # paragraphs: the total number of body paragraphs minus the number of
    # body paragraphs that come after this one.
    def number
      transcr = transcription
      total = transcr.number_of_body_paragraphs
      following = transcr.nokogiri.xpath("//tei:body//tei:p[preceding::tei:p[@xml:id='#{@pid}']]", TEI_NAMESPACE).count
      total - following
    end

    # Returns the Paragraph that follows this one in document order, or nil
    # when this is the last paragraph.
    def next
      adjacent_paragraph('following')
    end

    # Returns the Paragraph that precedes this one in document order, or nil
    # when this is the first paragraph.
    def previous
      adjacent_paragraph('preceding')
    end

    # Transforms the whole transcription with the given stylesheet and returns
    # the <p> element(s) of the result whose @id equals this paragraph's id.
    def transform(xsltfile, xslt_param_array=[])
      result = transcription.transform(xsltfile, xslt_param_array)
      result.xpath("//p[@id='#{@pid}']")
    end

    # Returns this paragraph from the "clean" transformation of the
    # transcription.
    #
    # Note: it may look confusing that paragraph plain text uses the clean
    # transform rather than the plain-text one; the basic paragraph elements
    # are still needed in the output in order to select the desired paragraph.
    def transform_plain_text(xslt_param_array=[])
      result = transcription.transform_clean(xslt_param_array)
      result.xpath("//p[@id='#{@pid}']")
    end

    # Number of whitespace-separated words in the paragraph's plain text.
    def word_count
      transform_plain_text.text.split.size
    end

    # Array of the paragraph's words, downcased, in document order.
    def word_array
      transform_plain_text.text.split.map(&:downcase)
    end

    # Returns a Hash of word => number of occurrences.
    #
    # sort  - "frequency" or "word".
    # order - "ascending" or "descending".
    #
    # Any other combination leaves the hash in first-occurrence order.
    def word_frequency(sort='frequency', order='descending')
      counts = Hash.new(0)
      word_array.each { |word| counts[word] += 1 }
      return counts.to_h unless %w[ascending descending].include?(order)

      pairs =
        case sort
        when "frequency" then counts.sort_by { |_word, count| count }
        when "word" then counts.sort_by { |word, _count| word }
        else return counts.to_h
        end
      # Reverse the ascending sort rather than sorting by a negated key so the
      # relative order of equal entries matches a plain reversed sort.
      pairs.reverse! if order == "descending"
      pairs.to_h
    end

    private

    # Builds a fresh Transcription for this paragraph's file.
    def transcription
      Transcription.new(@projectfile, @filehash)
    end

    # Looks up the nearest <p> on the given XPath axis ('following' or
    # 'preceding') and wraps its xml:id in a Paragraph. Returns nil when the
    # axis is empty; NodeSet#text would be "" (never nil) in that case, so the
    # node set itself has to be tested.
    def adjacent_paragraph(axis)
      ids = transcription.nokogiri.xpath("//tei:p[@xml:id='#{@pid}']/#{axis}::tei:p[1]/@xml:id", TEI_NAMESPACE)
      return nil if ids.empty?

      Paragraph.new(@projectfile, @filehash, ids.text)
    end
  end
end
@@ -0,0 +1,249 @@
1
+ require 'nokogiri'
2
+ require 'rugged'
3
+ require 'lbp/functions'
4
+ require 'lbp/item'
5
+ require 'open-uri'
6
+
7
module Lbp
  # One transcription (critical or documentary) of an item, described by a
  # file hash with the keys :fs, :type, :ed, :source and :path.
  #
  # When the source is 'local' and the requested edition (:ed) differs from
  # the item's current git branch, the transformation and paragraph methods
  # temporarily check out the edition branch and restore the original branch
  # afterwards.
  class Transcription
    TEI_NAMESPACE = { 'tei' => 'http://www.tei-c.org/ns/1.0' }.freeze

    attr_reader :fs, :type, :ed, :xslt_dir

    # projectfile - path to the project file (handed to Collection and Item).
    # filehash    - hash describing the transcription file.
    def initialize(projectfile, filehash)
      @filehash = filehash
      @projectfile = projectfile

      @fs = filehash[:fs]
      @type = filehash[:type] # critical or documentary
      @ed = filehash[:ed]

      @confighash = Collection.new(@projectfile).confighash
      @xslthash = @confighash[:xslt_dirs]

      # The xslt version needs to be gathered from a method; for now it is
      # nil because no documents currently declare it, so the "default"
      # schema is always selected.
      xslt_version = nil
      @schema = @xslthash[xslt_version || "default"]

      # Any other type leaves the xslt directory nil.
      @xslt_dir =
        case @type
        when 'critical' then @schema[:critical]
        when 'documentary' then @schema[:documentary]
        end

      return unless @filehash[:source] == 'local'

      item = Item.new(@projectfile, @fs)
      @current_branch = item.git_current_branch
      # The item is only kept when a branch switch will actually be needed.
      @item = item if @current_branch != @ed
    end

    ## Begin file path methods

    # Returns the path of the file requested, as given in the file hash.
    def file_path
      @filehash[:path]
    end

    # Opens the transcription file and returns the IO; the caller is
    # responsible for closing it.
    #
    # NOTE(review): this goes through Kernel#open, which only handles URLs
    # where open-uri still patches it (Ruby < 3.0) - confirm whether remote
    # paths must keep working on newer Rubies (URI.open would be needed).
    def file
      open(file_path)
    end

    # Parses the transcription file and returns the Nokogiri document. The
    # underlying IO is closed once parsing has finished.
    def nokogiri
      io = file
      begin
        Nokogiri::XML(io)
      ensure
        io.close if io.respond_to?(:close) && !(io.respond_to?(:closed?) && io.closed?)
      end
    end
    ## End File Path Methods

    ### Item Header Extraction and Metadata Methods

    # Text of the first title in the TEI header ("" when absent).
    def title
      header_text("/tei:TEI/tei:teiHeader[1]/tei:fileDesc[1]/tei:titleStmt[1]/tei:title[1]")
    end

    # Concatenated text of the author element(s) in the title statement.
    def author
      header_text("/tei:TEI/tei:teiHeader[1]/tei:fileDesc/tei:titleStmt[1]/tei:author")
    end

    # Concatenated text of the editor element(s) in the title statement.
    def editor
      header_text("/tei:TEI/tei:teiHeader[1]/tei:fileDesc/tei:titleStmt[1]/tei:editor")
    end

    # Value of edition/@n, or nil when the header does not declare it.
    def ed_no
      header_attribute("/tei:TEI/tei:teiHeader[1]/tei:fileDesc[1]/tei:editionStmt[1]/tei:edition[1]/@n")
    end

    # Value of edition/date/@when, or nil when the header does not declare it.
    def ed_date
      header_attribute("/tei:TEI/tei:teiHeader[1]/tei:fileDesc[1]/tei:editionStmt[1]/tei:edition[1]/tei:date[1]/@when")
    end

    # Value of publicationStmt/date/@when, or nil when not declared.
    def pub_date
      header_attribute("/tei:TEI/tei:teiHeader[1]/tei:fileDesc[1]/tei:publicationStmt[1]/tei:date[1]/@when")
    end

    # Value of variantEncoding/@method, or nil when not declared.
    def encoding_method
      header_attribute("/tei:TEI/tei:teiHeader[1]/tei:encodingDesc[1]/tei:variantEncoding[1]/@method")
    end

    # Value of variantEncoding/@location, or nil when not declared.
    def encoding_location
      header_attribute("/tei:TEI/tei:teiHeader[1]/tei:encodingDesc[1]/tei:variantEncoding[1]/@location")
    end

    # Returns nil for critical transcriptions; otherwise 1 when the document
    # contains any <pb>, 2 when it contains no <pb> but some <cb>, and nil
    # when it contains neither.
    #
    # NOTE(review): <pb> is tested before <cb>, so a document containing both
    # reports 1 - confirm that this is the intended precedence.
    def number_of_columns
      return nil if @type == "critical"

      xmldoc = nokogiri
      if xmldoc.xpath("//tei:pb", TEI_NAMESPACE).count != 0
        1
      elsif xmldoc.xpath("//tei:cb", TEI_NAMESPACE).count != 0
        2
      end
    end

    # Disabled code - I think these methods belong with the Item or ItemRepo Object
    #
    # ### End Header and Metadata Information Extraction Methods ###
    # ### Begin GIT functions ###
    # def is_git_dir
    #   gitpath = @file_dir + ".git"
    #
    #   if File.directory?(gitpath)
    #     true
    #   else
    #     false
    #   end
    # end
    # def git_branches
    #   repo = Rugged::Repository.new(@file_dir)
    #   branches = repo.branches.map { |branch| branch.name }
    #   return branches
    # end
    # def git_current_branch
    #   repo = Rugged::Repository.new(@file_dir)
    #   current_branch = repo.head.name.gsub(%r!\Arefs/heads/(.*)\z!) { $1 }
    #   return current_branch
    # end
    # def git_tags
    #   repo = Rugged::Repository.new(@file_dir)
    #   tags = repo.tags.map { |tag| tag.name }
    #   return tags
    # end
    # #need test for this
    # def git_checkout(branch)
    #   repo = Rugged::Repository.new(@file_dir)
    #   repo.checkout(branch)
    # end
    # ### End Git Methods ###

    ### Begin transform (XSLT) methods ###

    # Runs xslt_transform on the transcription file with the given stylesheet
    # and parameters and returns its result. The edition branch is checked
    # out for the duration of the transform when necessary.
    def transform(xsltfile, xslt_param_array=[])
      xmlfile = file_path
      with_edition_checked_out { xslt_transform(xmlfile, xsltfile, xslt_param_array) }
    end

    # Transforms with the schema's :main_view stylesheet ("text_display.xsl").
    def transform_main_view(xslt_param_array=[])
      transform_view(:main_view, xslt_param_array)
    end

    # Transforms with the schema's :index_view stylesheet ("text_display_index.xsl").
    def transform_index_view(xslt_param_array=[])
      transform_view(:index_view, xslt_param_array)
    end

    # Transforms with the schema's :clean_view stylesheet ("clean_forStatistics.xsl").
    def transform_clean(xslt_param_array=[])
      transform_view(:clean_view, xslt_param_array)
    end

    # Transforms with the schema's :plain_text stylesheet ("plaintext.xsl").
    def transform_plain_text(xslt_param_array=[])
      transform_view(:plain_text, xslt_param_array)
    end

    # Transforms with the schema's :toc stylesheet ("lectio_outline.xsl").
    def transform_toc(xslt_param_array=[])
      transform_view(:toc, xslt_param_array)
    end
    ### End of Transformation Methods ###

    ### Begin Statistics Methods ###

    # Number of whitespace-separated words in the plain-text transformation.
    def word_count
      transform_plain_text.text.split.size
    end

    # Array of the words of the plain-text transformation, downcased.
    def word_array
      transform_plain_text.text.split.map(&:downcase)
    end

    # Returns a Hash of word => number of occurrences.
    #
    # sort  - "frequency" or "word".
    # order - "ascending" or "descending".
    #
    # Any other combination leaves the hash in first-occurrence order.
    def word_frequency(sort='frequency', order='descending')
      counts = Hash.new(0)
      word_array.each { |word| counts[word] += 1 }
      return counts.to_h unless %w[ascending descending].include?(order)

      pairs =
        case sort
        when "frequency" then counts.sort_by { |_word, count| count }
        when "word" then counts.sort_by { |word, _count| word }
        else return counts.to_h
        end
      # Reverse the ascending sort rather than sorting by a negated key so the
      # relative order of equal entries matches a plain reversed sort.
      pairs.reverse! if order == "descending"
      pairs.to_h
    end

    # Number of <p> elements inside the TEI body of the requested edition.
    def number_of_body_paragraphs
      with_edition_checked_out { nokogiri.xpath("//tei:body//tei:p", TEI_NAMESPACE) }.count
    end

    # Returns an Array of Paragraph objects, one per xml:id attribute of the
    # <p> elements inside the TEI body of the requested edition.
    def paragraphs
      ids = with_edition_checked_out { nokogiri.xpath("//tei:body//tei:p/@xml:id", TEI_NAMESPACE) }
      ids.map { |id| Paragraph.new(@projectfile, @filehash, id.value) }
    end

    # Returns a Paragraph for the given xml:id (existence is not checked).
    def paragraph(pid)
      Paragraph.new(@projectfile, @filehash, pid)
    end

    private

    # Yields with the edition branch checked out and returns the block's
    # value. For non-local sources, or when the edition is already the
    # current branch, it simply yields. The original branch is restored even
    # if the block raises.
    def with_edition_checked_out
      return yield unless @current_branch != @ed && @filehash[:source] == 'local'

      @item.git_checkout(@ed)
      begin
        yield
      ensure
        @item.git_checkout(@current_branch)
      end
    end

    # Transforms with the stylesheet registered in the schema under +view+,
    # forwarding the caller's XSLT parameters.
    def transform_view(view, xslt_param_array)
      transform(@xslt_dir + @schema[view], xslt_param_array)
    end

    # Text content of every node matching +xpath+ in the current file.
    def header_text(xpath)
      nokogiri.xpath(xpath, TEI_NAMESPACE).text
    end

    # Value of the first attribute matching +xpath+, or nil when none does.
    def header_attribute(xpath)
      attribute = nokogiri.at_xpath(xpath, TEI_NAMESPACE)
      attribute && attribute.value
    end
  end
end
@@ -0,0 +1,3 @@
1
module Lbp
  # Version of the lbp gem. Frozen so the shared string cannot be mutated in
  # place by code that reads it.
  VERSION = "0.0.1".freeze
end
@@ -0,0 +1,60 @@
1
+ require 'spec_helper'
2
+ require 'lbp'
3
+ require 'pry'
4
+ require 'nokogiri'
5
+
6
describe 'collection object' do
  require_relative "config_globals"

  $collection_obj = Lbp::Collection.new($pg_projectfile)

  # Examples that call a reader without arguments and only check the class of
  # the returned value: [example description, reader, expected class].
  [
    ['should get list of item filestems in sequenced array', :item_filestems, Array],
    ['should get a list of item names in sequenced array', :item_titles, Array],
    ['should return a hash of filestems and item names', :items_fs_title_hash, Hash],
    # NOTE: calling result.first.title on the array returned by #items
    # currently raises an error.
    ['should get list of item objects in an array', :items, Array],
    ['should return local texts dir', :local_texts_dir, String],
    ['should return general repo directory', :git_repo, String],
    ['should return citation lists directory', :citation_lists_dir, String],
    ['should return xslt hash', :xslt_dirs, Hash]
  ].each do |description, reader, expected_class|
    it description do
      expect($collection_obj.public_send(reader)).to be_kind_of(expected_class)
    end
  end

  it 'should return a specific item object when a specific item group id is given' do
    expect($collection_obj.item('lectio1')).to be_kind_of(Lbp::Item)
  end

  it 'should return the title of a given collection specified in the project data file' do
    expect($collection_obj.title).to be_kind_of(String)
  end
end