chupa-text 1.3.5 → 1.3.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Rakefile +12 -2
- data/doc/text/news.md +12 -0
- data/lib/chupa-text/decomposers/office-open-xml-workbook.rb +2 -2
- data/lib/chupa-text/decomposers/office-open-xml.rb +59 -6
- data/lib/chupa-text/version.rb +1 -1
- data/test/decomposers/test-office-open-xml-workbook.rb +76 -1
- data/test/fixture/xlsx/complex-shared-strings.xlsx +0 -0
- metadata +4 -3
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 111930c1ba73f6eaae79fb538e34a51e72fd29768bca71ec89d43f041c34d960
|
|
4
|
+
data.tar.gz: 4051f7bc52f057a2e06cd7c7d23d3d8ce1755aae47b2352cc8d9e9e2dddfdfde
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 9ef8a9c17bf65b1d1a7e26112390964738985c3c9b761dc37c157292a324045eba7565576523817fe738f6158aa1d0362260cf13d28360baec2aa8e9942f2fdc
|
|
7
|
+
data.tar.gz: 75f63640af7ad7a55d0925dcd37ee23e7bd64c83291d80f0796d6e18fff86f2b76340170c7fa1be572070a4bdee9502a8aa9dfbf7e9946fb4dbd06e6fdb1edbf
|
data/Rakefile
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
# -*-
|
|
1
|
+
# -*- ruby -*-
|
|
2
2
|
#
|
|
3
|
-
# Copyright (C) 2013 Kouhei
|
|
3
|
+
# Copyright (C) 2013-2025 Sutou Kouhei <kou@clear-code.com>
|
|
4
4
|
#
|
|
5
5
|
# This library is free software; you can redistribute it and/or
|
|
6
6
|
# modify it under the terms of the GNU Lesser General Public
|
|
@@ -46,3 +46,13 @@ desc "Run tests"
|
|
|
46
46
|
task :test do
|
|
47
47
|
ruby("test/run-test.rb")
|
|
48
48
|
end
|
|
49
|
+
|
|
50
|
+
release_task = Rake.application["release"]
|
|
51
|
+
# We use Trusted Publishing.
|
|
52
|
+
release_task.prerequisites.delete("build")
|
|
53
|
+
release_task.prerequisites.delete("release:rubygem_push")
|
|
54
|
+
release_task_comment = release_task.comment
|
|
55
|
+
if release_task_comment
|
|
56
|
+
release_task.clear_comments
|
|
57
|
+
release_task.comment = release_task_comment.gsub(/ and build.*$/, "")
|
|
58
|
+
end
|
data/doc/text/news.md
CHANGED
|
@@ -1,5 +1,17 @@
|
|
|
1
1
|
# News
|
|
2
2
|
|
|
3
|
+
## 1.3.6: 2025-01-10
|
|
4
|
+
|
|
5
|
+
### Fixes
|
|
6
|
+
|
|
7
|
+
* xlsx: Fixed a bug that wrong text is extracted with complex shared
|
|
8
|
+
strings.
|
|
9
|
+
* Reported by Tomohisa Kusukawa.
|
|
10
|
+
|
|
11
|
+
### Thanks
|
|
12
|
+
|
|
13
|
+
* Tomohisa Kusukawa
|
|
14
|
+
|
|
3
15
|
## 1.3.5: 2024-09-22
|
|
4
16
|
|
|
5
17
|
### Improvements
|
|
data/lib/chupa-text/decomposers/office-open-xml-workbook.rb
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
# Copyright (C) 2019-
|
|
1
|
+
# Copyright (C) 2019-2025 Sutou Kouhei <kou@clear-code.com>
|
|
2
2
|
#
|
|
3
3
|
# This library is free software; you can redistribute it and/or
|
|
4
4
|
# modify it under the terms of the GNU Lesser General Public
|
|
@@ -49,7 +49,7 @@ module ChupaText
|
|
|
49
49
|
def process_entry(entry, context)
|
|
50
50
|
case entry.zip_path
|
|
51
51
|
when "xl/sharedStrings.xml"
|
|
52
|
-
|
|
52
|
+
extract_shared_strings(entry, context[:shared_strings])
|
|
53
53
|
when "xl/workbook.xml"
|
|
54
54
|
listener = WorkbookListener.new(context[:sheet_names])
|
|
55
55
|
parse(entry.file_data, listener)
|
|
data/lib/chupa-text/decomposers/office-open-xml.rb
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
# Copyright (C) 2019 Kouhei
|
|
1
|
+
# Copyright (C) 2019-2025 Sutou Kouhei <kou@clear-code.com>
|
|
2
2
|
#
|
|
3
3
|
# This library is free software; you can redistribute it and/or
|
|
4
4
|
# modify it under the terms of the GNU Lesser General Public
|
|
@@ -70,6 +70,11 @@ module ChupaText
|
|
|
70
70
|
parse(entry.file_data, listener)
|
|
71
71
|
end
|
|
72
72
|
|
|
73
|
+
def extract_shared_strings(entry, shared_strings)
|
|
74
|
+
listener = SharedStringsListener.new(shared_strings, @namespace_uri)
|
|
75
|
+
parse(entry.file_data, listener)
|
|
76
|
+
end
|
|
77
|
+
|
|
73
78
|
def log_tag
|
|
74
79
|
"[decomposer][office-open-xml]"
|
|
75
80
|
end
|
|
@@ -90,26 +95,74 @@ module ChupaText
|
|
|
90
95
|
end
|
|
91
96
|
|
|
92
97
|
def end_element(uri, local_name, qname)
|
|
98
|
+
if uri == @target_uri
|
|
99
|
+
case local_name
|
|
100
|
+
when "p", "br"
|
|
101
|
+
@output << "\n"
|
|
102
|
+
when "t"
|
|
103
|
+
@in_target = false
|
|
104
|
+
end
|
|
105
|
+
end
|
|
106
|
+
end
|
|
107
|
+
|
|
108
|
+
def characters(text)
|
|
109
|
+
add_text(text)
|
|
110
|
+
end
|
|
111
|
+
|
|
112
|
+
def cdata(content)
|
|
113
|
+
add_text(content)
|
|
114
|
+
end
|
|
115
|
+
|
|
116
|
+
private
|
|
117
|
+
def add_text(text)
|
|
118
|
+
return unless @in_target
|
|
119
|
+
@output << text
|
|
120
|
+
end
|
|
121
|
+
end
|
|
122
|
+
|
|
123
|
+
class SharedStringsListener < SAXListener
|
|
124
|
+
def initialize(output, target_uri)
|
|
125
|
+
@output = output
|
|
126
|
+
@target_uri = target_uri
|
|
127
|
+
@tag_stack = []
|
|
93
128
|
@in_target = false
|
|
129
|
+
@current_text = +""
|
|
130
|
+
end
|
|
131
|
+
|
|
132
|
+
def start_element(uri, local_name, qname, attributes)
|
|
133
|
+
@tag_stack << local_name
|
|
134
|
+
|
|
135
|
+
return unless uri == @target_uri
|
|
136
|
+
case local_name
|
|
137
|
+
when "t"
|
|
138
|
+
@in_target = true
|
|
139
|
+
@current_text = +""
|
|
140
|
+
end
|
|
141
|
+
end
|
|
94
142
|
|
|
143
|
+
def end_element(uri, local_name, qname)
|
|
95
144
|
return unless uri == @target_uri
|
|
96
145
|
case local_name
|
|
97
|
-
when "
|
|
98
|
-
@
|
|
146
|
+
when "t"
|
|
147
|
+
add_text(@current_text)
|
|
148
|
+
@in_target = false
|
|
99
149
|
end
|
|
150
|
+
ensure
|
|
151
|
+
@tag_stack.pop
|
|
100
152
|
end
|
|
101
153
|
|
|
102
154
|
def characters(text)
|
|
103
|
-
|
|
155
|
+
@current_text << text if @in_target
|
|
104
156
|
end
|
|
105
157
|
|
|
106
158
|
def cdata(content)
|
|
107
|
-
|
|
159
|
+
@current_text << content if @in_target
|
|
108
160
|
end
|
|
109
161
|
|
|
110
162
|
private
|
|
111
163
|
def add_text(text)
|
|
112
|
-
|
|
164
|
+
parent_tag = @tag_stack[-2]
|
|
165
|
+
return unless parent_tag == "si"
|
|
113
166
|
@output << text
|
|
114
167
|
end
|
|
115
168
|
end
|
data/lib/chupa-text/version.rb
CHANGED
|
data/test/decomposers/test-office-open-xml-workbook.rb
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
# Copyright (C) 2019 Kouhei
|
|
1
|
+
# Copyright (C) 2019-2025 Sutou Kouhei <kou@clear-code.com>
|
|
2
2
|
#
|
|
3
3
|
# This library is free software; you can redistribute it and/or
|
|
4
4
|
# modify it under the terms of the GNU Lesser General Public
|
|
@@ -153,6 +153,81 @@ class TestDecomposersOfficeOpenXMLWorkbook < Test::Unit::TestCase
|
|
|
153
153
|
end
|
|
154
154
|
end
|
|
155
155
|
|
|
156
|
+
def test_complex_shared_strings
|
|
157
|
+
path = fixture_path("xlsx", "complex-shared-strings.xlsx")
|
|
158
|
+
actual = decompose(path).collect do |data|
|
|
159
|
+
[
|
|
160
|
+
data["index"],
|
|
161
|
+
data["name"],
|
|
162
|
+
data.body,
|
|
163
|
+
]
|
|
164
|
+
end
|
|
165
|
+
assert_equal([
|
|
166
|
+
[nil, nil, ""],
|
|
167
|
+
[
|
|
168
|
+
0,
|
|
169
|
+
"新規",
|
|
170
|
+
"No\t案件番号\t開始日\t期日\tステータス\t備考\n" +
|
|
171
|
+
"1\t-\t45664\t45672\t対応中\n" +
|
|
172
|
+
"2\t-\t45664\t45672\t対応中\n" +
|
|
173
|
+
"3\t-\t45664\t45672\t対応中\n" +
|
|
174
|
+
"4\t-\t45664\t45666\t対応中\n" +
|
|
175
|
+
"5\t-\t45664\t45666\t対応中\n" +
|
|
176
|
+
"6\t-\t45663\t45665\t承認待ち\n" +
|
|
177
|
+
"7\t-\t45660\t45665\t承認待ち\n" +
|
|
178
|
+
"8\t-\t45653\t45663\t承認待ち\n" +
|
|
179
|
+
"9\t-\t45653\t45663\t承認待ち\n" +
|
|
180
|
+
"10\tPSR2401770\t45652\t45666\t対応中\n",
|
|
181
|
+
],
|
|
182
|
+
[
|
|
183
|
+
1,
|
|
184
|
+
"全体",
|
|
185
|
+
"No\t案件番号\t開始日\t期日\tステータス\n" +
|
|
186
|
+
"1\tPSR2401564\t45617\t45726\t対応中\n" +
|
|
187
|
+
"2\tPSR2401194\t45553\t45716\t対応中\n" +
|
|
188
|
+
"3\t-\t45664\t45672\t対応中\n" +
|
|
189
|
+
"4\t-\t45664\t45672\t対応中\n" +
|
|
190
|
+
"5\t-\t45664\t45672\t対応中\n" +
|
|
191
|
+
"6\t-\t45645\t45672\t対応中\n" +
|
|
192
|
+
"7\tPSR2401746\t45649\t45671\t対応中\n" +
|
|
193
|
+
"8\t-\t45640\t45667\t対応中\n" +
|
|
194
|
+
"9\t-\t45635\t45667\t対応中\n" +
|
|
195
|
+
"10\tPSR2401605\t45623\t45667\t対応中\n" +
|
|
196
|
+
"11\t-\t45664\t45666\t対応中\n" +
|
|
197
|
+
"12\t-\t45664\t45666\t対応中\n" +
|
|
198
|
+
"13\tPSR2401770\t45652\t45666\t対応中\n" +
|
|
199
|
+
"14\t-\t45645\t45665\t対応中\n" +
|
|
200
|
+
"15\tPSR2401609\t45624\t45666\t対応中\n",
|
|
201
|
+
],
|
|
202
|
+
[
|
|
203
|
+
2,
|
|
204
|
+
"案件",
|
|
205
|
+
"No\t案件番号\t開始日\t対応完了時期想定\n" +
|
|
206
|
+
"1\tPSR2401244\t45561.40347222222\t45744\n" +
|
|
207
|
+
"2\tPSR2401592\t45621.598611111112\t45698\n" +
|
|
208
|
+
"3\tPSR2401682\t45638.40902777778\t45688\n" +
|
|
209
|
+
"4\tPSR2401706\t45643.383333333331\t45671\n" +
|
|
210
|
+
"5\tPSR2401779\t45653.490277777775\t45671\n" +
|
|
211
|
+
"6\tPSR2401805\t45664.436805555553\t調整中\n" +
|
|
212
|
+
"7\tPSR2400677\t45455.588194444441\t45651\t完了\n" +
|
|
213
|
+
"8\tPSR2401666\t45636.405555555553\t45653\t完了\n" +
|
|
214
|
+
"9\tPSR2401714\t45644.630555555559\t45652\t完了\n",
|
|
215
|
+
],
|
|
216
|
+
[
|
|
217
|
+
3,
|
|
218
|
+
"障害恒久対応・改善対応",
|
|
219
|
+
"No\t案件番号\t分類\t開始日\t対応完了時期想定\n" +
|
|
220
|
+
"1\tPSR2401334\t改善対応\t45576.411805555559\t45688\n" +
|
|
221
|
+
"2\tPSR2401335\t改善対応\t45576.415277777778\t45688\n" +
|
|
222
|
+
"3\tPSR2401410\t改善対応\t45588.428472222222\t調整中\n" +
|
|
223
|
+
"4\tPSR2401411\t改善対応\t45588.432638888888\t調整中\n" +
|
|
224
|
+
"5\tPSR2401718\t障害恒久対応\t45645.386111111111\t45695\n" +
|
|
225
|
+
"6\tPSR2401807\t障害恒久対応\t45664.546527777777\t調整中\t1/16リリースで調整中\n",
|
|
226
|
+
],
|
|
227
|
+
],
|
|
228
|
+
actual)
|
|
229
|
+
end
|
|
230
|
+
|
|
156
231
|
sub_test_case("invalid") do
|
|
157
232
|
def test_empty
|
|
158
233
|
messages = capture_log do
|
|
data/test/fixture/xlsx/complex-shared-strings.xlsx
ADDED
Binary file
|
metadata
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: chupa-text
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.3.5
|
|
4
|
+
version: 1.3.6
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Sutou Kouhei
|
|
8
8
|
bindir: bin
|
|
9
9
|
cert_chain: []
|
|
10
|
-
date: 2024-09-22 00:00:00.000000000 Z
|
|
10
|
+
date: 2025-01-10 00:00:00.000000000 Z
|
|
11
11
|
dependencies:
|
|
12
12
|
- !ruby/object:Gem::Dependency
|
|
13
13
|
name: archive-zip
|
|
@@ -189,6 +189,7 @@ files:
|
|
|
189
189
|
- test/fixture/tar/top-level.tar
|
|
190
190
|
- test/fixture/tar/utf-8.tar
|
|
191
191
|
- test/fixture/xlsx/attributes.xlsx
|
|
192
|
+
- test/fixture/xlsx/complex-shared-strings.xlsx
|
|
192
193
|
- test/fixture/xlsx/empty.xlsx
|
|
193
194
|
- test/fixture/xlsx/multi-sheets.xlsx
|
|
194
195
|
- test/fixture/xlsx/not-shared-cell.xlsx
|
|
@@ -232,7 +233,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
232
233
|
- !ruby/object:Gem::Version
|
|
233
234
|
version: '0'
|
|
234
235
|
requirements: []
|
|
235
|
-
rubygems_version: 3.6.
|
|
236
|
+
rubygems_version: 3.6.2
|
|
236
237
|
specification_version: 4
|
|
237
238
|
summary: ChupaText is an extensible text extractor. You can plug your custom text
|
|
238
239
|
extractor in ChupaText. You can write your plugin by Ruby.
|