dphil 0.1.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/CODE_OF_CONDUCT.md +49 -0
- data/Gemfile +6 -0
- data/LICENSE +201 -0
- data/README.md +54 -0
- data/Rakefile +11 -0
- data/dphil.gemspec +49 -0
- data/exe/dphil +10 -0
- data/lib/dphil.rb +53 -0
- data/lib/dphil/cache.rb +15 -0
- data/lib/dphil/change_list.rb +6 -0
- data/lib/dphil/character.rb +236 -0
- data/lib/dphil/character_matrix.rb +102 -0
- data/lib/dphil/cli.rb +26 -0
- data/lib/dphil/cli_commands/csv2ld.rb +71 -0
- data/lib/dphil/cli_commands/csv2nex.rb +37 -0
- data/lib/dphil/constants.rb +128 -0
- data/lib/dphil/converter.rb +58 -0
- data/lib/dphil/converters/csv2nex.rb +83 -0
- data/lib/dphil/ld_data_set.rb +25 -0
- data/lib/dphil/ld_output.rb +29 -0
- data/lib/dphil/lemma.rb +44 -0
- data/lib/dphil/lemma_list.rb +179 -0
- data/lib/dphil/log_formatter.rb +39 -0
- data/lib/dphil/logger.rb +27 -0
- data/lib/dphil/metrical_data.rb +78 -0
- data/lib/dphil/newick.rb +52 -0
- data/lib/dphil/paup.rb +34 -0
- data/lib/dphil/refinements.rb +8 -0
- data/lib/dphil/refinements/natural_sort.rb +52 -0
- data/lib/dphil/script_string.rb +124 -0
- data/lib/dphil/syllables.rb +43 -0
- data/lib/dphil/syllables/syllable.rb +45 -0
- data/lib/dphil/tei_xml.rb +142 -0
- data/lib/dphil/transliterate.rb +131 -0
- data/lib/dphil/tree.rb +142 -0
- data/lib/dphil/tree_node.rb +67 -0
- data/lib/dphil/verse.rb +25 -0
- data/lib/dphil/verse_analysis.rb +509 -0
- data/lib/dphil/verse_analysis_new.rb +816 -0
- data/lib/dphil/version.rb +30 -0
- data/vendor/default_commands.paup +18 -0
- data/vendor/metrical_data.yml +4035 -0
- metadata +409 -0
# frozen_string_literal: true

# Registers the `csv2nex` subcommand on the Dphil CLI (GLI-style DSL).
Dphil::CLI.module_eval do
  desc "Convert a CSV-format collation file into a NEXUS file"
  long_desc <<~EOS
    Convert a CSV-format collation file into a NEXUS file for use with PAUP.
    This expects each column of the CSV to represent data for a single taxon,
    and the first row to contain the names of the taxa.
  EOS

  arg :csv_file

  command :csv2nex do |c|
    c.desc "Transpose rows/columns in CSV"
    c.switch :t, :transpose, negatable: false

    c.desc "Include custom PAUP commands from a file in PAUP block of NEXUS output"
    c.flag :d, :paup_data, arg_name: "file"

    c.desc "Write NEXUS output to file instead of STDOUT"
    c.flag :o, :outfile, arg_name: "file"

    c.action do |_, copts, args|
      nexus = Dphil::Csv2NexConverter.new(args[0], copts).convert

      if copts[:outfile].nil?
        # No output file given: print the NEXUS document to STDOUT.
        puts nexus
      else
        out_path = Pathname.new(copts[:outfile]).expand_path
        rel_outfile = out_path.relative_path_from(Pathname.getwd)
        puts "#{File.write(out_path, nexus)} bytes written to #{rel_outfile}"
        puts "You can process this file using PAUP with the command\n" \
             "`paup4 [options] #{rel_outfile}`"
      end
    end
  end
end
# frozen_string_literal: true

require "set"

module Dphil
  # Gem-wide constants: debug flag, regular expressions for SLP1-encoded
  # Sanskrit syllables, and the JSON-LD @context/@type tables used by the
  # Linked Data output classes.
  module Constants
    using ::Ragabash::Refinements

    # True in a Rails "dev*" environment or when RUBY_ENV starts with "dev".
    DEBUG = begin
      rails_dev = defined?(::Rails) && ::Rails.env[/^dev/]
      ruby_dev = !ENV["RUBY_ENV"].nil? && ENV["RUBY_ENV"][/^dev/]
      (rails_dev || ruby_dev) ? true : false
    end

    # Regular expressions for SLP1 syllables
    begin
      vowels = "aAiIuUfFxXeEoO"
      consonants = "kKgGNcCjJYwWqQRtTdDnpPbBmyrlvzSsh"
      finals = "MH" # presumably anusvāra/visarga markers — confirm

      # One syllable: optional avagraha, onset consonants, a vowel, then
      # trailing consonants/finals not followed by another vowel.
      R_SYL = /[']?[#{consonants}]*[\s]*[#{vowels}][#{consonants}#{finals}]*(?![#{vowels}])\s*/
      # Heavy-syllable test: long vowel/diphthong, or final M/H.
      R_GSYL = /[AIUFXeEoO]|[MH]$/
      # Consonant cluster at end of string / anywhere in string.
      R_CCONF = /[#{consonants}]{2}$/
      R_CCON = /[#{consonants}]{2}/
    end

    # Markers for control words ({#...#}) used during transliteration.
    TRANS_CTRL_WORD = /\{#.*?#\}/
    TRANS_CTRL_WORD_CONTENT = /\{#(.*?)#\}/
    TRANS_CTRL_WORD_PROCESSED = /#[a-f0-9]{40}#/

    # Linked Data types and contexts
    begin
      ctx_global = {
        "@version" => 1.1,
        "oa" => "http://www.w3.org/ns/oa#",
        "dc" => "http://purl.org/dc/elements/1.1/",
        "xsd" => "http://www.w3.org/2001/XMLSchema#",
        "ubcs" => "http://ld.ubcsanskrit.ca/api#",
        "id" => { "@id" => "dc:identifier" },
      }

      # Both camelCase and snake_case keys map onto the same terms so either
      # Ruby- or JSON-style payloads compact identically.
      ctx_character = {
        "states" => { "@id" => "ubcs:charStateBySymbol", "@container" => "@index" },
        "symbols" => { "@id" => "ubcs:charSymbolByState", "@container" => "@index" },
        "stateTotals" => { "@id" => "ubcs:charStateTotalsByState", "@container" => "@index" },
        "state_totals" => { "@id" => "ubcs:charStateTotalsByState", "@container" => "@index" },
        "taxaStates" => { "@id" => "ubcs:charStateByTaxon", "@container" => "@index" },
        "taxa_states" => { "@id" => "ubcs:charStateByTaxon", "@container" => "@index" },
        "statesTaxa" => { "@id" => "ubcs:taxonByCharState", "@container" => "@index" },
        "states_taxa" => { "@id" => "ubcs:taxonByCharState", "@container" => "@index" },
        "isInformative" => { "@id" => "ubcs:charStateIsInformative" },
        "is_informative" => { "@id" => "ubcs:charStateIsInformative" },
        "isConstant" => { "@id" => "ubcs:charStateIsConstant" },
        "is_constant" => { "@id" => "ubcs:charStateIsConstant" },
      }

      ctx_matrix = {
        "taxaNames" => { "@id" => "dc:identifier", "@container" => "@index" },
        "taxa_names" => { "@id" => "dc:identifier", "@container" => "@index" },
        "characters" => {
          "@id" => "ubcs:phyloCharacter",
          "@container" => "@index",
          "@context" => ctx_character,
        },
      }

      ctx_tree_node = {
        "name" => { "@id" => "ubcs:treeNodeName" },
        "length" => { "@id" => "ubcs:branchLength" },
        "parent" => { "@id" => "ubcs:treeNodeParent" },
        "children" => { "@id" => "ubcs:treeNodeChildren" },
      }

      ctx_tree = {
        "rootId" => { "@id" => "ubcs:treeRootId" },
        "root_id" => { "@id" => "ubcs:treeRootId" },
        "nodes" => {
          "@id" => "ubcs:treeNode",
          "@container" => "@index",
          "@context" => ctx_tree_node,
        },
        "stats" => {
          "@id" => "ubcs:treeStats",
          "@context" => {
            "ci" => { "@id" => "ubcs:treeCI" },
            "ciEx" => { "@id" => "ubcs:treeCIEx" },
            "ci_ex" => { "@id" => "ubcs:treeCIEx" },
            "hi" => { "@id" => "ubcs:treeHI" },
            "hiEx" => { "@id" => "ubcs:treeHIEx" },
            "hi_ex" => { "@id" => "ubcs:treeHIEx" },
            # NOTE(review): "ubcs:treeLengh" looks misspelled ("treeLength"),
            # but it is a published vocabulary term — confirm upstream before
            # changing it.
            "length" => { "@id" => "ubcs:treeLengh" },
            "rc" => { "@id" => "ubcs:treeRC" },
            "ri" => { "@id" => "ubcs:treeRI" },
          },
        },
      }

      ctx_dataset = {
        "matrix" => {
          "@id" => "ubcs:characterMatrix",
          "@context" => ctx_matrix,
        },
        "trees" => {
          "@id" => "ubcs:tree",
          "@container" => "@index",
          "@context" => ctx_tree,
        },
      }

      # JSON-LD @type per serializable class.
      LD_TYPES = {
        "Dphil::Character" => "ubcs:phyloCharacter",
        "Dphil::CharacterMatrix" => "ubcs:characterMatrix",
        "Dphil::TreeNode" => "ubcs:treeNode",
        "Dphil::Tree" => "ubcs:tree",
        "Dphil::LDDataSet" => "ubcs:dataSet",
      }.deep_freeze

      # JSON-LD @context per serializable class (global terms merged in).
      LD_CONTEXTS = {
        "Dphil::Character" => ctx_global.merge(ctx_character),
        "Dphil::CharacterMatrix" => ctx_global.merge(ctx_matrix),
        "Dphil::TreeNode" => ctx_global.merge(ctx_tree_node),
        "Dphil::Tree" => ctx_global.merge(ctx_tree),
        "Dphil::LDDataSet" => ctx_global.merge(ctx_dataset),
      }.deep_freeze
    end
  end
end
# frozen_string_literal: true

module Dphil
  #
  # Base module for file converters (CSV, NEXUS, CollateX, etc.)
  #
  module Converter
    private

    # Read +infile+ and return its contents.
    # Raises IOError when the file does not exist.
    def load_file(infile)
      raise IOError, "File #{infile} not found." unless File.exist?(infile)
      File.read(infile)
    end

    # Parse +infile+ as CSV and return the rows.
    # +mode+ is the IO mode string (e.g. "r:bom|utf-8").
    def load_csv(infile, mode = "r")
      raise IOError, "File #{infile} not found." unless File.exist?(infile)
      CSV.read(infile, mode)
    end

    # Return a hash of value => count, ordered by descending count and then
    # by first-seen order for ties.
    def weighted_uniq(array)
      counts = array.each_with_object(Hash.new(0)) { |value, acc| acc[value] += 1 }
      counts.sort_by.with_index { |(_, count), order| [-count, order] }.to_h
    end

    # Sanitize a character string to basic KH/ASCII
    def sanitize_char(str)
      str = str.to_s
      src = Sanscript.detect(str) || :iast
      str = Sanscript.transliterate(str, src, :kh)
      str.gsub!(/\s/, "_")
      str.tr!("'", "`")
      str.strip!
      str
    end

    # Tokenize the values of a character: map each distinct sanitized state
    # to [symbol, count], most frequent states first.
    # NOTE(review): with more than 52 distinct states ALPHABET[i] is nil —
    # presumably inputs never exceed that; confirm.
    def tokenize(characters)
      char_set = weighted_uniq(characters.map { |c| sanitize_char(c) }.reject(&:empty?))
      char_set.each_with_object({}).with_index do |(char, acc), i|
        acc[char[0]] = [ALPHABET[i], char[1]]
      end
    end

    # NEX Token Alphabet (A-Z then a-z), frozen.
    ALPHABET = IceNine.deep_freeze(("A".."Z").to_a + ("a".."z").to_a)
    private_constant :ALPHABET
  end
end
# frozen_string_literal: true

module Dphil
  #
  # CSV to NEXUS file converter class
  #
  class Csv2NexConverter
    include Dphil::Converter

    # @param csv_file [String] path to the CSV collation file
    # @param opts [Hash] :transpose toggles row/column orientation,
    #   :paup_data names a custom PAUP command file
    def initialize(csv_file, opts = {})
      opts = opts.to_h

      # Load csv file (BOM-aware), optionally transposed.
      @csv = load_csv(csv_file, "r:bom|utf-8")
      @csv = @csv.transpose if opts[:transpose]

      # Load PAUP command block, falling back to the bundled defaults.
      opts[:paup_data] = File.join(GEM_ROOT, "vendor", "default_commands.paup") if opts[:paup_data].nil?
      @paup = load_file(opts[:paup_data])
      @paup << "\n" unless @paup.blank? || @paup[-1] == "\n"
      @paup.indent!(2)
      @paup.freeze
    end

    # Perform the conversion and return the NEXUS document as a String.
    def convert
      # Taxa come from the header row; every subsequent row is a character.
      taxa_count = @csv.first.count
      character_count = @csv.count - 1
      taxa_labels = @csv.first.map { |name| name.to_s.strip.scrub.gsub(/[^A-Za-z0-9]/, "_") }

      # Build CHARSTATELABELS and the taxon-by-character matrix in one pass.
      character_labels = []
      character_matrix = taxa_labels.map { |label| [label] }
      (1..character_count).each do |r|
        row = @csv[r]
        token_hash = tokenize(row)
        state_names = token_hash.map { |state, _| "'#{sanitize_char(state)}'" }.join(" ")
        character_labels << %(#{r} /#{state_names})
        row.each_with_index do |charstate, i|
          token = token_hash[sanitize_char(charstate)]
          # Missing/empty states become the gap symbol "-".
          character_matrix[i] << (token.nil? ? "-" : token[0])
        end
      end
      character_matrix.map! { |arr| "#{arr.shift} #{arr.join('')}" }

      # Return NEXUS output
      <<~NEXUS_EOF
        #NEXUS

        BEGIN TAXA;
        TITLE Manuscripts;
        DIMENSIONS NTAX=#{taxa_count};
        TAXLABELS #{taxa_labels.join(' ')};
        END;

        BEGIN CHARACTERS;
        TITLE Variant_Matrix;
        DIMENSIONS NCHAR=#{character_count};
        FORMAT DATATYPE = STANDARD RESPECTCASE GAP = - MISSING = ? SYMBOLS = "#{ALPHABET.join(' ')}";
        CHARSTATELABELS #{character_labels.join(', ')};
        MATRIX
        #{character_matrix.join("\n ")}
        ;

        END;

        BEGIN ASSUMPTIONS;
        OPTIONS DEFTYPE = UNORD;
        END;

        BEGIN PAUP;
        #{@paup}END;
      NEXUS_EOF
    end
  end
end
# frozen_string_literal: true

module Dphil
  # Aggregate Linked Data container pairing a character matrix with a set
  # of trees, serializable through the LDOutput mixin.
  class LDDataSet
    include Dphil::LDOutput

    attr_reader :matrix, :trees

    def initialize(matrix:, trees:)
      @matrix = matrix
      @trees = trees
    end

    # @return [Hash] symbol-keyed hash form of the dataset
    def to_h
      { matrix: matrix, trees: trees }
    end

    # Delegates JSON serialization to the hash form.
    def as_json(options = nil)
      to_h.as_json(options)
    end
  end
end
# frozen_string_literal: true

module Dphil
  #
  # Mixin module for Linked Data output
  #
  # Requires that a class implements +#as_json+
  #
  module LDOutput
    using Dphil::Refinements::NaturalSort

    # Outputs a Linked Data Hash.
    #
    # @param options [Hash] :context / :ld_type override the defaults from
    #   Constants::LD_CONTEXTS / Constants::LD_TYPES; :compact => false
    #   returns the expanded JSON-LD form instead of the compacted one.
    #   Remaining options are forwarded to #as_json.
    # @return [Hash] compacted (default) or expanded JSON-LD
    def as_jsonld(**options)
      ld = {
        "@context" => options.delete(:context) || Constants::LD_CONTEXTS[self.class.name],
        "@type" => options.delete(:ld_type) || Constants::LD_TYPES[self.class.name],
      }.merge!(as_json(options))

      ld_expanded = JSON::LD::API.expand(ld)
      return ld_expanded if options[:compact] == false

      ld_compact = JSON::LD::API.compact(ld_expanded, ld["@context"])
      # Keep "@context" first; remaining keys in natural sort order.
      { "@context" => ld_compact.delete("@context") }.merge!(ld_compact.natural_sort_keys)
    end

    # Outputs Linked Data serialized as JSON.
    def to_jsonld(**options)
      # FIX: forward keywords explicitly — under Ruby 3's keyword-argument
      # separation, passing the Hash positionally to a **kwargs method
      # raises ArgumentError (it silently converted on Ruby 2).
      as_jsonld(**options).to_json(options)
    end
  end
end
# frozen_string_literal: true

module Dphil
  # Public: A storage object for words and groups of words from TEI XML data.
  # Also contains information about the source/location of the words.
  # Immutable.
  class Lemma
    using ::Ragabash::Refinements

    # Public: Raw source XML, plain text, page/facsimile/line locators,
    # and ordinal index of the lemma.
    attr_reader :source, :text, :page, :facs, :line, :index

    # Public: Initialize a lemma object.
    #
    # source - XML data to initialize the lemma from
    # index  - ordinal position within the parent list (optional)
    def initialize(source = "", index = nil)
      @source = source.strip
      @index = index

      doc = Nokogiri::XML("<lemma>#{source}</lemma>") { |config| config.strict.noent }
      doc.encoding = "UTF-8"

      # Plain text with hyphenation artifacts removed.
      @text = doc.text.strip.gsub(/\-+\s*\-*/, "")
      page_breaks = doc.css("pb")
      @page = page_breaks.map { |el| el.attr("n") }.join(",")
      @facs = page_breaks.map { |el| el.attr("facs") }.join(",")
      @line = doc.css("lb").map { |el| el.attr("n") }.join(",")
    rescue Nokogiri::XML::SyntaxError => e
      $stderr.puts "Error in Lemma.new(`#{source}`, ...): #{e}"
      abort
    end

    def to_s
      "(#{index}|#{page}:#{line}) #{text}"
    end

    def to_sym
      "<Lemma>#{self}".to_sym
    end

    # Lemmata are equal when their raw source strings match.
    def ==(other)
      return false unless other.is_a?(Dphil::Lemma)
      source == other.source
    end
  end
end
# frozen_string_literal: true

require "nokogiri"

module Dphil
  # An object containing a list of lemmata generated through SAX parsing of an
  # XML document.
  # Immutable.
  class LemmaList < ::Nokogiri::XML::SAX::Document
    using ::Ragabash::Refinements
    include Enumerable

    attr_reader :name

    # Parses +source+ (a TEI XML string) immediately; resulting Lemma
    # objects are exposed through the Enumerable interface.
    def initialize(source)
      @members = []
      source = source.to_s.strip
      return if source.empty?
      # Structural wrappers whose tags are not part of any lemma.
      @lemma_ignore_start_tags = Set.new(%w[TEI text body pre post div])
      @lemma_ignore_end_tags = @lemma_ignore_start_tags + Set.new(%w[pb lb])
      @index = 0
      @open_elements = []
      @current_pb = []
      @current_lb = []
      @current_chars = ""
      @current_lemma = []
      @inside_hyphen = false
      @empty_element = true

      @parser = Nokogiri::XML::SAX::Parser.new(self)
      @parser.parse(source)
    end

    def each(&block)
      @members.each(&block)
    end

    # Returns all members, or the first +limit+ when given a number.
    def members(limit = nil)
      return @members[0, limit] if limit.is_a? Numeric
      @members
    end

    def [](*args)
      @members[*args]
    end

    # 1-based accessor; indices below 1 are clamped with a warning.
    def get(index)
      raise "Non-numeric index passed to Lemma.get" unless index.is_a? Numeric
      if index < 1
        warn "Minimum index of Lemma.get() is 1"
        index = 1
      end
      @members[index - 1]
    end

    def size
      @members.size
    end

    def to_s
      @members.map(&:text).join("\n")
    end

    # CollateX-style token hashes for each lemma.
    def cx_tokens
      @members.map do |lemma|
        out = {
          t: lemma.text,
          n: Transliterate.normalize_iast(lemma.text),
          i: lemma.index,
          p: lemma.page,
          f: lemma.facs,
          l: lemma.line,
        }
        warn "Token empty: #{out}" if out[:t].empty?
        out
      end
    end

    private

    # SAX callback: open tag. <pb>/<lb> milestones are tracked in their own
    # lists; other elements also go on a stack so unbalanced tags can be
    # repaired when a lemma is finalized.
    def start_element(name, attrs = [])
      return if @lemma_ignore_start_tags.include?(name)

      el = if %w[pb lb].include?(name)
             milestone = gen_xmlel(name, attrs, true)
             if @current_lemma.empty?
               instance_variable_set("@current_#{name}", [milestone])
             else
               instance_variable_get("@current_#{name}") << milestone
             end
             milestone
           else
             # FIX: build the element string once and reuse it (the original
             # called gen_xmlel twice, allocating a redundant duplicate).
             opened = gen_xmlel(name, attrs)
             @open_elements << opened
             opened
           end

      @empty_element = true
      @current_lemma << el unless el.empty?
    end

    # SAX callback: close tag. Collapses empty elements into self-closing
    # form; otherwise appends a close tag and pops the open-element stack.
    def end_element(name)
      return if @lemma_ignore_end_tags.include?(name)

      if @empty_element
        # NOTE(review): assumes the matching open tag was just appended to
        # @current_lemma — confirm inputs always nest ignored tags outermost.
        @current_lemma[-1] = @current_lemma[-1].gsub(%r{/*>\z}, "/>")
        @empty_element = false
      else
        @current_lemma << "</#{name}>"
      end
      @open_elements.pop
    end

    # SAX callback: character data. Splits on whitespace (keeping separators)
    # and finalizes the current lemma at word boundaries, unless a trailing
    # hyphen indicates the word continues across a break.
    def characters(string)
      @empty_element = false
      string.split(/(\s)/).reject(&:empty?).each do |chunk|
        @current_chars += chunk.strip

        if chunk.match?(/\-$/)
          @inside_hyphen = true
        elsif chunk.match?(/^\-?[^\s]/)
          @inside_hyphen = false
        end

        if chunk.match?(/^\s+$/) && !@inside_hyphen
          finalize
          next
        end

        text = chunk.strip
        @current_lemma << text unless text.empty?
      end
    end

    # SAX callback: document finished. Flush the last lemma and drop all
    # parser state except the member list.
    def end_document
      finalize
      (instance_variables - [:@members]).each do |var|
        remove_instance_variable(var)
      end
    end

    # Serialize an open (or self-closing) tag from SAX attribute pairs.
    def gen_xmlel(name, attrs, self_closing = false)
      attr_list = attrs.reduce("") do |result, attr|
        # FIX: escape double quotes in attribute values — the previous
        # gsub('"', '"') was a no-op, so values containing a quote produced
        # malformed XML (likely an &quot; lost in transit).
        %(#{result} #{attr[0]}="#{attr[1].gsub('"', '&quot;')}")
      end
      self_closing ? "<#{name}#{attr_list}/>" : "<#{name}#{attr_list}>"
    end

    # Derive the matching close tag from an open-tag string.
    def gen_xmlclose(el)
      el.gsub(/^<([^\s\>]+).*/, '</\\1>')
    end

    # Append the accumulated source as a new Lemma, unless it contains no
    # lemma-worthy characters (only whitespace/hyphens/daṇḍa punctuation).
    def append_lemma
      return unless @current_chars.match?(/[^\s\-\.\|]+/)
      new_lemma = Lemma.new(@current_lemma.join(""), @index)
      @index += 1
      @members << new_lemma
    end

    # Close out the current lemma: prepend active <pb>/<lb> milestones,
    # balance still-open tags, then reset state (re-opening unclosed tags
    # so the next lemma remains well-formed).
    def finalize
      return if @current_lemma.empty?
      @current_lemma.unshift(@current_lb.first) unless @current_lemma[0] == @current_lb.first
      @current_lemma.unshift(@current_pb.first) unless @current_lemma[0] == @current_pb.first

      # Make sure missing open or close tags are inserted
      unless @open_elements.empty?
        @current_lemma.concat(@open_elements.reverse.map { |e| gen_xmlclose(e) })
        prime_next = @open_elements.dup
      end

      append_lemma

      @current_pb = [@current_pb.last]
      @current_lb = [@current_lb.last]
      @current_chars = ""
      @current_lemma = prime_next || []
      @inside_hyphen = false
    end
  end
end