tbx_importer 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +9 -0
- data/.rspec +2 -0
- data/.travis.yml +3 -0
- data/Gemfile +4 -0
- data/README.md +71 -0
- data/Rakefile +1 -0
- data/lib/tbx_importer/version.rb +3 -0
- data/lib/tbx_importer.rb +150 -0
- data/spec/sample_files/sample_1(utf-16).tbx +0 -0
- data/spec/sample_files/sample_1(utf-8).tbx +59 -0
- data/spec/spec_helper.rb +2 -0
- data/spec/tbx_importer_spec.rb +83 -0
- data/tbx_importer.gemspec +27 -0
- metadata +144 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 4f732759e6d8b7243d4ee79cc9926309cdda6ea1
|
4
|
+
data.tar.gz: 27f497202083e911be6c9ab6c8aa2ccec4d977f5
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 2c5e2f90baded7436b56c8f522b455cb4b696f857579376cdd51b8bfcff8ea300f395f47b054ed32c04e58bc7812e6e752cee63cde6815d68d1c13d4ce978580
|
7
|
+
data.tar.gz: 30e08a3ab20b63d3e3459a5f73d0d05cb69c3490b03f46e2f5f64e10345b1ff0950b2c63996b4566233d9590d89da9fa77cc64760555cd95fb66b291b742bc43
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/README.md
ADDED
@@ -0,0 +1,71 @@
|
|
1
|
+
# TBX (TermBase eXchange) Importer
|
2
|
+
|
3
|
+
[![Gem Version](https://badge.fury.io/rb/tbx_importer.svg)](https://badge.fury.io/rb/tbx_importer) [![Build Status](https://travis-ci.org/diasks2/tbx_importer.png)](https://travis-ci.org/diasks2/tbx_importer) [![License](https://img.shields.io/badge/license-MIT-brightgreen.svg?style=flat)](https://github.com/diasks2/tbx_importer/blob/master/LICENSE.txt)
|
4
|
+
|
5
|
+
This gem handles the importing and parsing of [.tbx files](http://www.ttt.org/oscarStandards/tbx/tbx_oscar.pdf). [TBX files](http://www.ttt.org/tbx/) are XML files.
|
6
|
+
|
7
|
+
## Installation
|
8
|
+
|
9
|
+
Install the gem with RubyGems, or add it to your application's Gemfile:
|
10
|
+
|
11
|
+
**Ruby**
|
12
|
+
```
|
13
|
+
gem install tbx_importer
|
14
|
+
```
|
15
|
+
|
16
|
+
**Ruby on Rails**
|
17
|
+
Add this line to your application’s Gemfile:
|
18
|
+
```ruby
|
19
|
+
gem 'tbx_importer'
|
20
|
+
```
|
21
|
+
|
22
|
+
## Usage
|
23
|
+
|
24
|
+
```ruby
|
25
|
+
# Get the high level stats of a TBX file
|
26
|
+
# Including the encoding is optional. If not included the gem will attempt to detect the encoding.
|
27
|
+
file_path = File.expand_path('../tbx_importer/spec/sample_files/sample.tbx')
|
28
|
+
tbx = TbxImporter::Tbx.new(file_path: file_path)
|
29
|
+
tbx.stats
|
30
|
+
# => {:tc_count=>1, :term_count=>3, :language_pairs=>[["en", "fr"], ["en", "es"]]}
|
31
|
+
|
32
|
+
# Extract the segments of a TBX file
|
33
|
+
# Result: [term concepts, terms]
|
34
|
+
# term concepts = [tc_id, definition]
|
35
|
+
# terms = [tc_id, language, part_of_speech, term]
|
36
|
+
|
37
|
+
tbx.import
|
38
|
+
# => [[["6234-1457917153-1", "the earth, together with all of its countries, peoples, and natural features."]], [["6234-1457917153-1", "en", "noun", "world"], ["6234-1457917153-1", "fr", "noun", "monde"], ["6234-1457917153-1", "es", "noun", "mundo"]]]
|
39
|
+
```
|
40
|
+
|
41
|
+
## Contributing
|
42
|
+
|
43
|
+
1. Fork it ( https://github.com/diasks2/tbx_importer/fork )
|
44
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
45
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
46
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
47
|
+
5. Create a new Pull Request
|
48
|
+
|
49
|
+
## License
|
50
|
+
|
51
|
+
The MIT License (MIT)
|
52
|
+
|
53
|
+
Copyright (c) 2016 Kevin S. Dias
|
54
|
+
|
55
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
56
|
+
of this software and associated documentation files (the "Software"), to deal
|
57
|
+
in the Software without restriction, including without limitation the rights
|
58
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
59
|
+
copies of the Software, and to permit persons to whom the Software is
|
60
|
+
furnished to do so, subject to the following conditions:
|
61
|
+
|
62
|
+
The above copyright notice and this permission notice shall be included in
|
63
|
+
all copies or substantial portions of the Software.
|
64
|
+
|
65
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
66
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
67
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
68
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
69
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
70
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
71
|
+
THE SOFTWARE.
|
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
data/lib/tbx_importer.rb
ADDED
@@ -0,0 +1,150 @@
|
|
1
|
+
require 'tbx_importer/version'
|
2
|
+
require 'xml'
|
3
|
+
require 'open-uri'
|
4
|
+
require 'pretty_strings'
|
5
|
+
require 'charlock_holmes'
|
6
|
+
|
7
|
+
module TbxImporter
  # Reads a TBX (TermBase eXchange) file and exposes two operations:
  #
  # * #stats  - line-based counts of term entries, terms and language pairs
  # * #import - XML-reader based extraction of term concepts and terms
  #
  # Only UTF-8, UTF-16LE and UTF-16BE input is accepted. Non UTF-8 input is
  # converted to UTF-8 once, in the constructor.
  class Tbx
    # Encodings the importer accepts (compared after upcasing).
    SUPPORTED_ENCODINGS = %w[UTF-8 UTF-16LE UTF-16BE].freeze

    attr_reader :file_path, :encoding

    # @param file_path [String] path of the TBX file on disk
    # @param args [Hash] optional :encoding ('UTF-8', 'UTF-16LE' or 'UTF-16BE');
    #   when omitted the encoding is detected from the file content
    # @raise [RuntimeError] when the encoding cannot be determined or is not supported
    def initialize(file_path:, **args)
      @file_path = file_path
      requested = args[:encoding]
      # The raw content is only needed for detection/conversion. File.read is
      # used instead of File.read(open(path)) so no handle is left open and the
      # path is never interpreted by Kernel#open.
      @content = File.read(@file_path) unless requested.eql?('UTF-8')
      @encoding = requested.nil? ? detect_encoding : requested.upcase
      @doc = blank_doc
      raise "Encoding type could not be determined. Please set an encoding of UTF-8, UTF-16LE, or UTF-16BE" if @encoding.nil?
      raise "Encoding type not supported. Please choose an encoding of UTF-8, UTF-16LE, or UTF-16BE" unless SUPPORTED_ENCODINGS.include?(@encoding)
      @text = CharlockHolmes::Converter.convert(@content, @encoding, 'UTF-8') unless @encoding.eql?('UTF-8')
    end

    # Counts closing termEntry tags, closing term tags and collects
    # [source_language, other_language] pairs by scanning the file line by line.
    # The counters are reset first, so repeated calls (or a call made after
    # #import) report the same numbers.
    #
    # @return [Hash] { tc_count:, term_count:, language_pairs: }
    def stats
      reset_stats_state
      if encoding.eql?('UTF-8')
        analyze_stats_utf_8
      else
        analyze_stats_utf_16
      end
      {
        tc_count: @doc[:tc][:counter],
        term_count: @doc[:term][:counter],
        language_pairs: @doc[:language_pairs].uniq
      }
    end

    # Parses the file and returns the extracted rows. Results of an earlier
    # call are not carried over into a later one.
    #
    # @return [Array] [term_concepts, terms] where
    #   term_concepts = [[tc_id, definition], ...] and
    #   terms = [[tc_id, language, part_of_speech, term], ...]
    def import
      @doc[:tc][:vals] = []
      @doc[:term][:vals] = []
      parse_file(read_file)
      [@doc[:tc][:vals], @doc[:term][:vals]]
    ensure
      # Close the file opened for the UTF-8 reader, if any.
      @source_io.close if @source_io && !@source_io.closed?
    end

    private

    # Fresh bookkeeping structure shared by #stats and #import.
    def blank_doc
      {
        source_language: "",
        tc: { id: "", counter: 0, vals: [], lang: "", definition: "" },
        term: { lang: "", counter: 0, vals: [], part_of_speech: "" },
        language_pairs: [],
        term_entry: false
      }
    end

    # Clears everything #stats derives so each call starts from zero.
    def reset_stats_state
      @doc[:source_language] = ""
      @doc[:term_entry] = false
      @doc[:language_pairs] = []
      @doc[:tc][:counter] = 0
      @doc[:term][:counter] = 0
    end

    # Detects the encoding from the first ~100 KB of content; when detection
    # yields nothing, falls back to the encoding named in the XML declaration.
    # Returns nil when neither source gives a usable answer.
    def detect_encoding
      detected = CharlockHolmes::EncodingDetector.detect(@content[0..100_000])[:encoding]
      return detected unless detected.nil?

      # Non-bang scrub/delete are used on purpose: gsub! returns nil when there
      # is nothing to replace, which made the original chain raise. The value
      # pattern stops at the closing quote instead of running to the last quote
      # on the line.
      declared = @content.dup.force_encoding('utf-8').scrub('*').delete("\0")[/encoding=["']([^"']*)["']/, 1].to_s.upcase
      case declared
      when 'UTF-8' then 'UTF-8'
      when 'UTF-16' then 'UTF-16LE'
      end
    end

    # Streams the file from disk, feeding each line to analyze_line.
    def analyze_stats_utf_8
      File.foreach(@file_path) { |line| analyze_line(line) }
    end

    # Feeds each line of the already converted text to analyze_line.
    def analyze_stats_utf_16
      @text.each_line { |line| analyze_line(line) }
    end

    # Builds the XML reader: directly on the file for UTF-8 input, otherwise on
    # the converted and sanitized text.
    def read_file
      if encoding.eql?('UTF-8')
        @source_io = File.open(file_path)
        XML::Reader.io(@source_io, options: XML::Parser::Options::NOERROR, encoding: XML::Encoding::UTF_8)
      else
        XML::Reader.string(utf8_xml_text, options: XML::Parser::Options::NOERROR, encoding: XML::Encoding::UTF_8)
      end
    end

    # Converted text with the declared encoding rewritten to utf-8 and control
    # characters (literal or as &#x..; references) replaced by spaces.
    # The declaration pattern is limited to the quoted value; a greedy ".*"
    # would also swallow any markup following the declaration on the same line.
    def utf8_xml_text
      @text.gsub(/(?<=encoding=")[^"]*(?=")/, 'utf-8')
           .gsub(/&#x[0-1]?[0-9a-fA-F];/, ' ')
           .gsub(/[\0-\x1f\x7f\u2028]/, ' ')
    end

    # Updates the stats bookkeeping from a single line of the file.
    # The first language seen at or after the first line mentioning termEntry
    # becomes the source language; every other language seen afterwards is
    # paired with it.
    def analyze_line(line)
      @doc[:term_entry] = true if line.include?('termEntry')

      # Prefer single-quoted lang attributes, fall back to double-quoted ones.
      single_quoted = line.scan(/(?<=lang=\S)\S+(?=')/)
      language = (single_quoted.empty? ? line.scan(/(?<=lang=\S)\S+(?=")/) : single_quoted).uniq

      if @doc[:source_language].empty? && @doc[:term_entry] && !language.empty?
        @doc[:source_language] = language[0]
      end

      @doc[:tc][:counter] += line.scan(/<\/termEntry>/).count
      @doc[:term][:counter] += line.scan(/<\/term>/).count
      return if @doc[:source_language].empty?

      source = @doc[:source_language]
      language.each do |lang|
        @doc[:language_pairs] << [source, lang] unless source.eql?(lang)
      end
      @doc[:language_pairs] = @doc[:language_pairs].uniq
    end

    # Walks every node of the reader. A name not yet on the stack is pushed and
    # evaluated; a name equal to the top of the stack is popped, and popping a
    # termEntry starts a new concept id. The reader is closed even when
    # parsing raises.
    def parse_file(reader)
      tag_stack = []
      generate_unique_id
      while reader.read
        name = reader.name
        if !tag_stack.include?(name)
          tag_stack.push(name)
          eval_state(tag_stack, reader)
        elsif tag_stack.last == name
          # String#b gives a byte-wise comparison, like the original byte arrays.
          generate_unique_id if tag_stack.pop.b == 'termEntry'
        end
      end
    ensure
      reader.close
    end

    # Dispatches on the tag that was just pushed onto the stack.
    def eval_state(tag_stack, reader)
      case tag_stack.last.b
      when 'martif'
        @doc[:lang] = lang_attribute(reader)
      when 'termEntry'
        write_tc
      when 'langSet'
        @doc[:term][:lang] = lang_attribute(reader)
      when 'term'
        # A new term must not inherit the previous term's part of speech.
        @doc[:term][:part_of_speech] = ""
        write_term(reader)
      when 'termNote'
        record_part_of_speech(reader)
      when 'descrip'
        record_definition(reader)
      end
    end

    # Value of the lang attribute, falling back to xml:lang.
    def lang_attribute(reader)
      reader.get_attribute("lang") || reader.get_attribute("xml:lang")
    end

    # Attaches a partOfSpeech note to the most recently written term.
    # The term row is updated in place: re-reading the reader here would return
    # the note's own text (e.g. "Verb") and overwrite the term with it.
    def record_part_of_speech(reader)
      return unless reader.get_attribute("type").eql?("partOfSpeech")

      part_of_speech = clean(reader.read_string.downcase)
      @doc[:term][:part_of_speech] = part_of_speech
      current_term = @doc[:term][:vals].last
      current_term[2] = part_of_speech if current_term
    end

    # Replaces the latest term concept row with one carrying the definition.
    # Other descrip types are ignored so they cannot blank a stored definition.
    def record_definition(reader)
      return unless reader.get_attribute("type").eql?("definition")

      @doc[:tc][:definition] = clean(reader.read_string)
      @doc[:tc][:vals].pop
      write_tc
    end

    # Appends [tc_id, definition] and clears the pending definition.
    def write_tc
      @doc[:tc][:vals] << [@doc[:tc][:id], @doc[:tc][:definition]]
      @doc[:tc][:definition] = ""
    end

    # Appends [tc_id, language, part_of_speech, term] for the node the reader is
    # positioned on; does nothing when the node has no string content.
    def write_term(reader)
      raw = reader.read_string
      return if raw.nil?

      @doc[:term][:vals] << [@doc[:tc][:id], @doc[:term][:lang], @doc[:term][:part_of_speech], clean(raw)]
    end

    # Runs the text through PrettyStrings, then encodes backslashes and escapes
    # single quotes with a backslash.
    # NOTE(review): the published listing shows an unterminated "\" as the
    # backslash replacement; "&#92;" is the presumed original value - confirm
    # against the repository.
    def clean(string)
      PrettyStrings::Cleaner.new(string).pretty.gsub("\\", "&#92;").gsub("'", %q(\\\'))
    end

    # Builds "<4 random digits>-<unix time>-<running counter>" and stores it as
    # the id of the current term concept. Increments the tc counter.
    def generate_unique_id
      @doc[:tc][:id] = [(1..4).map { rand(10) }.join(''), Time.now.to_i, @doc[:tc][:counter] += 1].join("-")
    end
  end
end
|
Binary file
|
@@ -0,0 +1,59 @@
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?><martif type="TBX" xml:lang="en-US">
|
2
|
+
<martifHeader>
|
3
|
+
<fileDesc>
|
4
|
+
<titleStmt>
|
5
|
+
<title>Microsoft Terminology Collection Export</title>
|
6
|
+
</titleStmt>
|
7
|
+
<sourceDesc><p>Microsoft Terminology Collection</p></sourceDesc>
|
8
|
+
</fileDesc>
|
9
|
+
</martifHeader>
|
10
|
+
<text>
|
11
|
+
<body>
|
12
|
+
<termEntry id="2_76365">
|
13
|
+
<langSet xml:lang="en-US">
|
14
|
+
<descripGrp>
|
15
|
+
<descrip type="definition">To terminate a session with a computer accessed through a communications line usually a computer that is both distant and open to many users.</descrip>
|
16
|
+
</descripGrp>
|
17
|
+
<ntig>
|
18
|
+
<termGrp>
|
19
|
+
<term id="76365">log off</term>
|
20
|
+
<termNote type="partOfSpeech">Verb</termNote>
|
21
|
+
</termGrp>
|
22
|
+
</ntig>
|
23
|
+
</langSet>
|
24
|
+
<langSet xml:lang="fr-fr">
|
25
|
+
<ntig>
|
26
|
+
<termGrp>
|
27
|
+
<term id="215541">fermer une session</term>
|
28
|
+
<termNote type="partOfSpeech">Verb</termNote>
|
29
|
+
</termGrp>
|
30
|
+
</ntig>
|
31
|
+
</langSet>
|
32
|
+
</termEntry>
|
33
|
+
<termEntry id="3_106184"><langSet xml:lang="en-US"><descripGrp><descrip type="definition">To shorten the fractional part of a number, increasing the last remaining (rightmost) digit or not, according to whether the deleted portion was over or under five.</descrip></descripGrp><ntig><termGrp><term id="106184">round</term><termNote type="partOfSpeech">Verb</termNote></termGrp></ntig></langSet><langSet xml:lang="fr-fr"><ntig><termGrp><term id="106187">arrondir</term><termNote type="partOfSpeech">Verb</termNote></termGrp></ntig></langSet></termEntry><termEntry id="4_156085"><langSet xml:lang="en-US"><descripGrp><descrip type="definition">Personally identifiable information (PII) that is protected in special ways by law or policy.</descrip></descripGrp><ntig><termGrp><term id="156085">sensitive data</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet><langSet xml:lang="fr-fr"><ntig><termGrp><term id="156096">données sensibles</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet></termEntry><termEntry id="5_182425"><langSet xml:lang="en-US"><descripGrp><descrip type="definition">A business unit that is immediately under another business unit in the business hierarchy of an organization.</descrip></descripGrp><ntig><termGrp><term id="182425">child business unit</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet><langSet xml:lang="fr-fr"><ntig><termGrp><term id="182429">sous-division</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet></termEntry><termEntry id="7_96886"><langSet xml:lang="en-US"><descripGrp><descrip type="definition">The mode of a command-line application where it does not display confirmation messages or any other user interface items that normally appear on screen. 
The switch for quiet mode is typically '/q'.</descrip></descripGrp><ntig><termGrp><term id="96886">quiet mode</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet><langSet xml:lang="fr-fr"><ntig><termGrp><term id="212192">mode silencieux</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet></termEntry><termEntry id="9_179350"><langSet xml:lang="en-US"><descripGrp><descrip type="definition">A virtual hard disk that points to and uses an entire physical disk for the purpose of converting a data disk to a virtual hard disk. You cannot turn on a virtual machine if a linked disk is attached to the virtual machine.</descrip></descripGrp><ntig><termGrp><term id="179350">linked virtual hard disk</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet><langSet xml:lang="fr-fr"><ntig><termGrp><term id="179352">disque dur virtuel lié</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet></termEntry><termEntry id="10_307453"><langSet xml:lang="en-US"><descripGrp><descrip type="definition">An interface that is used to navigate the menus of a Unified Messaging (UM) system using DTMF or touchtone inputs.</descrip></descripGrp><ntig><termGrp><term id="307453">Telephone User Interface</term><termNote type="partOfSpeech">Proper Noun</termNote></termGrp></ntig></langSet><langSet xml:lang="fr-fr"><ntig><termGrp><term id="318298">interface utilisateur de téléphonie</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet></termEntry><termEntry id="10_307467"><langSet xml:lang="en-US"><descripGrp><descrip type="definition">An interface that is used to navigate the menus of a Unified Messaging (UM) system using DTMF or touchtone inputs.</descrip></descripGrp><ntig><termGrp><term id="307467">TUI</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet><langSet xml:lang="fr-fr"><ntig><termGrp><term id="318299">TUI</term><termNote 
type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet></termEntry><termEntry id="10_1218391"><langSet xml:lang="en-US"><descripGrp><descrip type="definition">An interface that is used to navigate the menus of a Unified Messaging (UM) system using DTMF or touchtone inputs.</descrip></descripGrp><ntig><termGrp><term id="1218391">touchtone interface</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet><langSet xml:lang="fr-fr"><ntig><termGrp><term id="1783210">interface à tonalité</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet></termEntry>
|
34
|
+
<termEntry id="12_233129">
|
35
|
+
<langSet xml:lang="en-US">
|
36
|
+
<descripGrp>
|
37
|
+
<descrip type="definition">The part of a service order that specifies detailed information about the requested service.</descrip>
|
38
|
+
</descripGrp>
|
39
|
+
<ntig>
|
40
|
+
<termGrp>
|
41
|
+
<term id="233129">service order line</term>
|
42
|
+
<termNote type="partOfSpeech">Noun</termNote>
|
43
|
+
</termGrp></ntig>
|
44
|
+
</langSet>
|
45
|
+
<langSet xml:lang="fr-fr">
|
46
|
+
<ntig>
|
47
|
+
<termGrp>
|
48
|
+
<term id="324824">ligne d'ordre de service</term>
|
49
|
+
<termNote type="partOfSpeech">Noun</termNote>
|
50
|
+
</termGrp>
|
51
|
+
</ntig>
|
52
|
+
<ntig>
|
53
|
+
<termGrp>
|
54
|
+
<term id="1950253">ligne commande service</term>
|
55
|
+
<termNote type="partOfSpeech">Noun</termNote>
|
56
|
+
</termGrp>
|
57
|
+
</ntig>
|
58
|
+
</langSet>
|
59
|
+
</termEntry><termEntry id="15_397203"><langSet xml:lang="en-US"><descripGrp><descrip type="definition">A logical grouping of managed folders. When a managed folder mailbox policy is applied to a user’s mailbox, all the managed folders that are linked to the policy are deployed in a single operation, thereby making the deployment of messaging records management (MRM) easier.</descrip></descripGrp><ntig><termGrp><term id="397203">managed folder mailbox policy</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet><langSet xml:lang="fr-fr"><ntig><termGrp><term id="410598">Stratégie de boîte aux lettres de dossier géré</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet></termEntry><termEntry id="16_494929"><langSet xml:lang="en-US"><descripGrp><descrip type="definition">The space between the inside edge of a child element and its content.</descrip></descripGrp><ntig><termGrp><term id="494929">padding</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet><langSet xml:lang="fr-fr"><ntig><termGrp><term id="499409">marge intérieure</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet></termEntry><termEntry id="20_103799"><langSet xml:lang="en-US"><descripGrp><descrip type="definition">For failover clusters or server clusters, a physical or logical entity that is capable of being managed by a cluster, brought online and taken offline, and moved between nodes. 
A resource can be owned only by a single node at any point in time.</descrip></descripGrp><ntig><termGrp><term id="103799">resource</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet><langSet xml:lang="fr-fr"><ntig><termGrp><term id="103802">ressource</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet></termEntry><termEntry id="21_216347"><langSet xml:lang="en-US"><descripGrp><descrip type="definition">A field that can be used in a workflow rule or template as a placeholder for specific values or text in an e-mail. When you send an e-mail, these placeholders are replaced with the data that meet the conditions that have been set in the workflow rule. For example, if you send an e-mail to an account holder about a contract cancellation date, you can use a dynamic data field to automatically find and insert the correct cancellation date for that contract.</descrip></descripGrp><ntig><termGrp><term id="216347">dynamic data field</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet><langSet xml:lang="fr-fr"><ntig><termGrp><term id="218231">champ de données dynamiques</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet></termEntry><termEntry id="22_364026"><langSet xml:lang="en-US"><descripGrp><descrip type="definition">A component of the World Wide Web Publishing Service (WWW service) in IIS that is responsible for configuration, by means of the metabase, and for worker process management.</descrip></descripGrp><ntig><termGrp><term id="364026">WWW Service Administration and Monitoring component</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet><langSet xml:lang="fr-fr"><ntig><termGrp><term id="365290">composant Analyse et administration du service WWW</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet></termEntry><termEntry id="23_461846"><langSet xml:lang="en-US"><descripGrp><descrip type="definition">A tool on 
the main Windows SteadyState interface used to set schedules for software and operating system updates.</descrip></descripGrp><ntig><termGrp><term id="461846">Schedule Important Software Updates</term><termNote type="partOfSpeech">Proper Noun</termNote></termGrp></ntig></langSet><langSet xml:lang="fr-fr"><ntig><termGrp><term id="514693">Planification de mises à jour logicielles importantes</term><termNote type="partOfSpeech">Proper Noun</termNote></termGrp></ntig></langSet></termEntry><termEntry id="24_537631"><langSet xml:lang="en-US"><descripGrp><descrip type="definition">A component that can be registered as part of the ASP.NET request lifecycle and that can read or change the request or response as it is processed. HttpModules are often used to perform special tasks that need to monitor each request, such as security or site statistics.</descrip></descripGrp><ntig><termGrp><term id="537631">HTTP module</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet><langSet xml:lang="fr-fr"><ntig><termGrp><term id="541917">module HTTP</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet></termEntry><termEntry id="25_42999"><langSet xml:lang="en-US"><descripGrp><descrip type="definition">In communications, the portion of a Data Terminal Equipment (DTE) device that sends data.</descrip></descripGrp><ntig><termGrp><term id="42999">data source</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet><langSet xml:lang="fr-fr"><ntig><termGrp><term id="43001">source de données</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet></termEntry><termEntry id="26_147281"><langSet xml:lang="en-US"><descripGrp><descrip type="definition">To deliver data to a client without a client request for the data.</descrip></descripGrp><ntig><termGrp><term id="147281">push</term><termNote type="partOfSpeech">Verb</termNote></termGrp></ntig></langSet><langSet xml:lang="fr-fr"><ntig><termGrp><term 
id="147288">effectuer une transmission de type push</term><termNote type="partOfSpeech">Verb</termNote></termGrp></ntig></langSet></termEntry><termEntry id="27_136378"><langSet xml:lang="en-US"><descripGrp><descrip type="definition">A transmission medium designed for high-speed data transfers over long distances. Cable modem services and Digital Subscriber Line (DSL) are examples of broadband networks.</descrip></descripGrp><ntig><termGrp><term id="136378">broadband</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet><langSet xml:lang="fr-fr"><ntig><termGrp><term id="212456">haut débit</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet></termEntry><termEntry id="27_216700"><langSet xml:lang="en-US"><descripGrp><descrip type="definition">A transmission medium designed for high-speed data transfers over long distances. Cable modem services and Digital Subscriber Line (DSL) are examples of broadband networks.</descrip></descripGrp><ntig><termGrp><term id="216700">broadband network</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet><langSet xml:lang="fr-fr"><ntig><termGrp><term id="216717">réseau haut débit</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet></termEntry><termEntry id="29_45263"><langSet xml:lang="en-US"><descripGrp><descrip type="definition">A mark placed over, under, or through a character, usually to indicate a change in phonetic value from the unmarked state.</descrip></descripGrp><ntig><termGrp><term id="45263">diacritical mark</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet><langSet xml:lang="fr-fr"><ntig><termGrp><term id="45266">signe diacritique</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet></termEntry><termEntry id="29_45264"><langSet xml:lang="en-US"><descripGrp><descrip type="definition">A mark placed over, under, or through a character, usually to indicate a change in 
phonetic value from the unmarked state.</descrip></descripGrp><ntig><termGrp><term id="45264">diacritic</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet><langSet xml:lang="fr-fr"><ntig><termGrp><term id="45266">signe diacritique</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet></termEntry><termEntry id="30_237215"><langSet xml:lang="en-US"><descripGrp><descrip type="definition">The part of a scenario that determines its uniqueness from other scenarios. </descrip></descripGrp><ntig><termGrp><term id="237215">differentiating factor</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet><langSet xml:lang="fr-fr"><ntig><termGrp><term id="276593">facteur de différenciation</term><termNote type="partOfSpeech">Noun</termNote></termGrp></ntig></langSet></termEntry><termEntry id="31_1167251"><langSet xml:lang="en-US"><descripGrp><descrip type="definition">A shared service in Windows SharePoint Services and SharePoint Server that provides a means for storing, securing, and administering external content types and related objects. </descrip></descripGrp><ntig><termGrp><term id="1167251">Business Data Connectivity service</term><termNote type="partOfSpeech">Proper Noun</termNote></termGrp></ntig></langSet><langSet xml:lang="fr-fr"><ntig><termGrp><term id="1167718">Service BDC</term><termNote type="partOfSpeech">Proper Noun</termNote></termGrp></ntig></langSet></termEntry></body></text></martif>
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,83 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe TbxImporter do
  # Absolute path of a fixture in the sample_files directory next to this spec.
  # Resolved from this file's location instead of the working directory, so the
  # suite does not depend on where it is started from or on the name of the
  # checkout directory.
  def sample_path(file_name)
    File.expand_path(File.join('sample_files', file_name), __dir__)
  end

  let(:utf_8_path) { sample_path('sample_1(utf-8).tbx') }
  let(:utf_16_path) { sample_path('sample_1(utf-16).tbx') }

  it 'has a version number' do
    expect(TbxImporter::VERSION).not_to be nil
  end

  describe '#stats' do
    it 'reports the stats of a UTF-8 TBX file' do
      tbx = TbxImporter::Tbx.new(file_path: utf_8_path)
      expect(tbx.stats).to eq({:tc_count=>25, :term_count=>51, :language_pairs=>[["en-US", "fr-fr"]]})
    end

    it 'reports the stats of a UTF-16 TBX file' do
      tbx = TbxImporter::Tbx.new(file_path: utf_16_path)
      expect(tbx.stats).to eq({:tc_count=>10, :term_count=>21, :language_pairs=>[["EN-US", "PL"], ['EN-US', 'FR']]})
    end
  end

  describe '#import' do
    it 'imports a UTF-8 TBX file' do
      tbx = TbxImporter::Tbx.new(file_path: utf_8_path).import
      expect(tbx[0].length).to eq(25)
    end

    it 'imports a UTF-8 TBX file 2' do
      tbx = TbxImporter::Tbx.new(file_path: utf_8_path).import
      expect(tbx[1].length).to eq(51)
    end

    # The last term must belong to the last term concept.
    it 'imports a UTF-8 TBX file 3' do
      tbx = TbxImporter::Tbx.new(file_path: utf_8_path).import
      expect(tbx[0][-1][0]).to eq(tbx[1][-1][0])
    end

    it 'imports a UTF-8 TBX file 4' do
      tbx = TbxImporter::Tbx.new(file_path: utf_8_path).import
      expect(tbx[0][1][1]).to eq("To shorten the fractional part of a number, increasing the last remaining (rightmost) digit or not, according to whether the deleted portion was over or under five.")
    end

    it 'imports a UTF-8 TBX file 5' do
      tbx = TbxImporter::Tbx.new(file_path: utf_8_path).import
      expect(tbx[1][1][2]).to eq("verb")
    end

    it 'imports a UTF-16 TBX file' do
      tbx = TbxImporter::Tbx.new(file_path: utf_16_path).import
      expect(tbx[0].length).to eq(10)
    end

    it 'imports a UTF-16 TBX file 2' do
      tbx = TbxImporter::Tbx.new(file_path: utf_16_path).import
      expect(tbx[1].length).to eq(21)
    end

    # The last term must belong to the last term concept.
    it 'imports a UTF-16 TBX file 3' do
      tbx = TbxImporter::Tbx.new(file_path: utf_16_path).import
      expect(tbx[0][-1][0]).to eq(tbx[1][-1][0])
    end

    it 'imports a UTF-16 TBX file 4' do
      tbx = TbxImporter::Tbx.new(file_path: utf_16_path).import
      expect(tbx[0][1][1]).to eq('')
    end

    it 'imports a UTF-16 TBX file 5' do
      tbx = TbxImporter::Tbx.new(file_path: utf_16_path).import
      expect(tbx[1][1][2]).to eq('')
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'tbx_importer/version'
|
5
|
+
|
6
|
+
# Gem specification for tbx_importer.
Gem::Specification.new do |spec|
  spec.name          = "tbx_importer"
  spec.version       = TbxImporter::VERSION
  spec.authors       = ["Kevin S. Dias"]
  spec.email         = ["diasks2@gmail.com"]

  spec.summary       = %q{TBX (TermBase eXchange) file importer}
  spec.description   = %q{Import the content of a TBX (TermBase eXchange) file}
  spec.homepage      = "https://github.com/diasks2/tbx_importer"
  # The README ships the MIT license text; declare it so the packaged gem
  # carries license metadata.
  spec.license       = "MIT"

  # Package every file tracked by git; executables and test files are derived
  # from that list by path prefix.
  spec.files         = `git ls-files -z`.split("\x0")
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 1.9"
  spec.add_development_dependency "rake", "~> 10.0"
  spec.add_development_dependency "rspec"
  spec.add_runtime_dependency "libxml-ruby"
  spec.add_runtime_dependency "pretty_strings", "~> 0.7.0"
  spec.add_runtime_dependency "charlock_holmes_bundle_icu", "~> 0.6.9.2"
end
|
metadata
ADDED
@@ -0,0 +1,144 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: tbx_importer
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Kevin S. Dias
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2016-03-17 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: bundler
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.9'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.9'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '10.0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '10.0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rspec
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: libxml-ruby
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: pretty_strings
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: 0.7.0
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: 0.7.0
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: charlock_holmes_bundle_icu
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: 0.6.9.2
|
90
|
+
type: :runtime
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: 0.6.9.2
|
97
|
+
description: Import the content of a TBX (TermBase eXchange) file
|
98
|
+
email:
|
99
|
+
- diasks2@gmail.com
|
100
|
+
executables: []
|
101
|
+
extensions: []
|
102
|
+
extra_rdoc_files: []
|
103
|
+
files:
|
104
|
+
- ".gitignore"
|
105
|
+
- ".rspec"
|
106
|
+
- ".travis.yml"
|
107
|
+
- Gemfile
|
108
|
+
- README.md
|
109
|
+
- Rakefile
|
110
|
+
- lib/tbx_importer.rb
|
111
|
+
- lib/tbx_importer/version.rb
|
112
|
+
- spec/sample_files/sample_1(utf-16).tbx
|
113
|
+
- spec/sample_files/sample_1(utf-8).tbx
|
114
|
+
- spec/spec_helper.rb
|
115
|
+
- spec/tbx_importer_spec.rb
|
116
|
+
- tbx_importer.gemspec
|
117
|
+
homepage: https://github.com/diasks2/tbx_importer
|
118
|
+
licenses: []
|
119
|
+
metadata: {}
|
120
|
+
post_install_message:
|
121
|
+
rdoc_options: []
|
122
|
+
require_paths:
|
123
|
+
- lib
|
124
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
125
|
+
requirements:
|
126
|
+
- - ">="
|
127
|
+
- !ruby/object:Gem::Version
|
128
|
+
version: '0'
|
129
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
130
|
+
requirements:
|
131
|
+
- - ">="
|
132
|
+
- !ruby/object:Gem::Version
|
133
|
+
version: '0'
|
134
|
+
requirements: []
|
135
|
+
rubyforge_project:
|
136
|
+
rubygems_version: 2.4.1
|
137
|
+
signing_key:
|
138
|
+
specification_version: 4
|
139
|
+
summary: TBX (TermBase eXchange) file importer
|
140
|
+
test_files:
|
141
|
+
- spec/sample_files/sample_1(utf-16).tbx
|
142
|
+
- spec/sample_files/sample_1(utf-8).tbx
|
143
|
+
- spec/spec_helper.rb
|
144
|
+
- spec/tbx_importer_spec.rb
|