chupa-text 1.3.5 → 1.3.6

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 8e24f007b2ce32dbfd3e2bc8a453d003d3cd9d528d097214e7c9a0a59a6e7427
4
- data.tar.gz: efbe5bc163d8f170f04074272071815002678ccbf7ec0f7d6282d7382aec5b2e
3
+ metadata.gz: 111930c1ba73f6eaae79fb538e34a51e72fd29768bca71ec89d43f041c34d960
4
+ data.tar.gz: 4051f7bc52f057a2e06cd7c7d23d3d8ce1755aae47b2352cc8d9e9e2dddfdfde
5
5
  SHA512:
6
- metadata.gz: 29dcae1ce19af6cade4f2cacbdd465f6f6af14ff566054c44fe021bfb8cdfd9d475a8fe5871387d46ade8063b30a0711f70470f677d405961f5a8e7eca04cfd9
7
- data.tar.gz: 638682d059e805c471b76ab0c1413e917acb690cee80b54b2855837203d029b8c0383c0e5e96d2d7676144cb79e048f0e4bd81b59b24e59c566c1b5cf0b3d731
6
+ metadata.gz: 9ef8a9c17bf65b1d1a7e26112390964738985c3c9b761dc37c157292a324045eba7565576523817fe738f6158aa1d0362260cf13d28360baec2aa8e9942f2fdc
7
+ data.tar.gz: 75f63640af7ad7a55d0925dcd37ee23e7bd64c83291d80f0796d6e18fff86f2b76340170c7fa1be572070a4bdee9502a8aa9dfbf7e9946fb4dbd06e6fdb1edbf
data/Rakefile CHANGED
@@ -1,6 +1,6 @@
1
- # -*- mode: ruby; coding: utf-8 -*-
1
+ # -*- ruby -*-
2
2
  #
3
- # Copyright (C) 2013 Kouhei Sutou <kou@clear-code.com>
3
+ # Copyright (C) 2013-2025 Sutou Kouhei <kou@clear-code.com>
4
4
  #
5
5
  # This library is free software; you can redistribute it and/or
6
6
  # modify it under the terms of the GNU Lesser General Public
@@ -46,3 +46,13 @@ desc "Run tests"
46
46
  task :test do
47
47
  ruby("test/run-test.rb")
48
48
  end
49
+
50
+ release_task = Rake.application["release"]
51
+ # We use Trusted Publishing.
52
+ release_task.prerequisites.delete("build")
53
+ release_task.prerequisites.delete("release:rubygem_push")
54
+ release_task_comment = release_task.comment
55
+ if release_task_comment
56
+ release_task.clear_comments
57
+ release_task.comment = release_task_comment.gsub(/ and build.*$/, "")
58
+ end
data/doc/text/news.md CHANGED
@@ -1,5 +1,17 @@
1
1
  # News
2
2
 
3
+ ## 1.3.6: 2025-01-10
4
+
5
+ ### Fixes
6
+
7
+ * xlsx: Fixed a bug where the wrong text was extracted from complex shared
8
+ strings.
9
+ * Reported by Tomohisa Kusukawa.
10
+
11
+ ### Thanks
12
+
13
+ * Tomohisa Kusukawa
14
+
3
15
  ## 1.3.5: 2024-09-22
4
16
 
5
17
  ### Improvements
@@ -1,4 +1,4 @@
1
- # Copyright (C) 2019-2024 Sutou Kouhei <kou@clear-code.com>
1
+ # Copyright (C) 2019-2025 Sutou Kouhei <kou@clear-code.com>
2
2
  #
3
3
  # This library is free software; you can redistribute it and/or
4
4
  # modify it under the terms of the GNU Lesser General Public
@@ -49,7 +49,7 @@ module ChupaText
49
49
  def process_entry(entry, context)
50
50
  case entry.zip_path
51
51
  when "xl/sharedStrings.xml"
52
- extract_text(entry, context[:shared_strings])
52
+ extract_shared_strings(entry, context[:shared_strings])
53
53
  when "xl/workbook.xml"
54
54
  listener = WorkbookListener.new(context[:sheet_names])
55
55
  parse(entry.file_data, listener)
@@ -1,4 +1,4 @@
1
- # Copyright (C) 2019 Kouhei Sutou <kou@clear-code.com>
1
+ # Copyright (C) 2019-2025 Sutou Kouhei <kou@clear-code.com>
2
2
  #
3
3
  # This library is free software; you can redistribute it and/or
4
4
  # modify it under the terms of the GNU Lesser General Public
@@ -70,6 +70,11 @@ module ChupaText
70
70
  parse(entry.file_data, listener)
71
71
  end
72
72
 
73
+ def extract_shared_strings(entry, shared_strings)
74
+ listener = SharedStringsListener.new(shared_strings, @namespace_uri)
75
+ parse(entry.file_data, listener)
76
+ end
77
+
73
78
  def log_tag
74
79
  "[decomposer][office-open-xml]"
75
80
  end
@@ -90,26 +95,74 @@ module ChupaText
90
95
  end
91
96
 
92
97
  def end_element(uri, local_name, qname)
98
+ if uri == @target_uri
99
+ case local_name
100
+ when "p", "br"
101
+ @output << "\n"
102
+ when "t"
103
+ @in_target = false
104
+ end
105
+ end
106
+ end
107
+
108
+ def characters(text)
109
+ add_text(text)
110
+ end
111
+
112
+ def cdata(content)
113
+ add_text(content)
114
+ end
115
+
116
+ private
117
+ def add_text(text)
118
+ return unless @in_target
119
+ @output << text
120
+ end
121
+ end
122
+
123
+ class SharedStringsListener < SAXListener
124
+ def initialize(output, target_uri)
125
+ @output = output
126
+ @target_uri = target_uri
127
+ @tag_stack = []
93
128
  @in_target = false
129
+ @current_text = +""
130
+ end
131
+
132
+ def start_element(uri, local_name, qname, attributes)
133
+ @tag_stack << local_name
134
+
135
+ return unless uri == @target_uri
136
+ case local_name
137
+ when "t"
138
+ @in_target = true
139
+ @current_text = +""
140
+ end
141
+ end
94
142
 
143
+ def end_element(uri, local_name, qname)
95
144
  return unless uri == @target_uri
96
145
  case local_name
97
- when "p", "br"
98
- @output << "\n"
146
+ when "t"
147
+ add_text(@current_text)
148
+ @in_target = false
99
149
  end
150
+ ensure
151
+ @tag_stack.pop
100
152
  end
101
153
 
102
154
  def characters(text)
103
- add_text(text)
155
+ @current_text << text if @in_target
104
156
  end
105
157
 
106
158
  def cdata(content)
107
- add_text(content)
159
+ @current_text << content if @in_target
108
160
  end
109
161
 
110
162
  private
111
163
  def add_text(text)
112
- return unless @in_target
164
+ parent_tag = @tag_stack[-2]
165
+ return unless parent_tag == "si"
113
166
  @output << text
114
167
  end
115
168
  end
@@ -15,5 +15,5 @@
15
15
  # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
16
 
17
17
  module ChupaText
18
- VERSION = "1.3.5"
18
+ VERSION = "1.3.6"
19
19
  end
@@ -1,4 +1,4 @@
1
- # Copyright (C) 2019 Kouhei Sutou <kou@clear-code.com>
1
+ # Copyright (C) 2019-2025 Sutou Kouhei <kou@clear-code.com>
2
2
  #
3
3
  # This library is free software; you can redistribute it and/or
4
4
  # modify it under the terms of the GNU Lesser General Public
@@ -153,6 +153,81 @@ class TestDecomposersOfficeOpenXMLWorkbook < Test::Unit::TestCase
153
153
  end
154
154
  end
155
155
 
156
+ def test_complex_shared_strings
157
+ path = fixture_path("xlsx", "complex-shared-strings.xlsx")
158
+ actual = decompose(path).collect do |data|
159
+ [
160
+ data["index"],
161
+ data["name"],
162
+ data.body,
163
+ ]
164
+ end
165
+ assert_equal([
166
+ [nil, nil, ""],
167
+ [
168
+ 0,
169
+ "新規",
170
+ "No\t案件番号\t開始日\t期日\tステータス\t備考\n" +
171
+ "1\t-\t45664\t45672\t対応中\n" +
172
+ "2\t-\t45664\t45672\t対応中\n" +
173
+ "3\t-\t45664\t45672\t対応中\n" +
174
+ "4\t-\t45664\t45666\t対応中\n" +
175
+ "5\t-\t45664\t45666\t対応中\n" +
176
+ "6\t-\t45663\t45665\t承認待ち\n" +
177
+ "7\t-\t45660\t45665\t承認待ち\n" +
178
+ "8\t-\t45653\t45663\t承認待ち\n" +
179
+ "9\t-\t45653\t45663\t承認待ち\n" +
180
+ "10\tPSR2401770\t45652\t45666\t対応中\n",
181
+ ],
182
+ [
183
+ 1,
184
+ "全体",
185
+ "No\t案件番号\t開始日\t期日\tステータス\n" +
186
+ "1\tPSR2401564\t45617\t45726\t対応中\n" +
187
+ "2\tPSR2401194\t45553\t45716\t対応中\n" +
188
+ "3\t-\t45664\t45672\t対応中\n" +
189
+ "4\t-\t45664\t45672\t対応中\n" +
190
+ "5\t-\t45664\t45672\t対応中\n" +
191
+ "6\t-\t45645\t45672\t対応中\n" +
192
+ "7\tPSR2401746\t45649\t45671\t対応中\n" +
193
+ "8\t-\t45640\t45667\t対応中\n" +
194
+ "9\t-\t45635\t45667\t対応中\n" +
195
+ "10\tPSR2401605\t45623\t45667\t対応中\n" +
196
+ "11\t-\t45664\t45666\t対応中\n" +
197
+ "12\t-\t45664\t45666\t対応中\n" +
198
+ "13\tPSR2401770\t45652\t45666\t対応中\n" +
199
+ "14\t-\t45645\t45665\t対応中\n" +
200
+ "15\tPSR2401609\t45624\t45666\t対応中\n",
201
+ ],
202
+ [
203
+ 2,
204
+ "案件",
205
+ "No\t案件番号\t開始日\t対応完了時期想定\n" +
206
+ "1\tPSR2401244\t45561.40347222222\t45744\n" +
207
+ "2\tPSR2401592\t45621.598611111112\t45698\n" +
208
+ "3\tPSR2401682\t45638.40902777778\t45688\n" +
209
+ "4\tPSR2401706\t45643.383333333331\t45671\n" +
210
+ "5\tPSR2401779\t45653.490277777775\t45671\n" +
211
+ "6\tPSR2401805\t45664.436805555553\t調整中\n" +
212
+ "7\tPSR2400677\t45455.588194444441\t45651\t完了\n" +
213
+ "8\tPSR2401666\t45636.405555555553\t45653\t完了\n" +
214
+ "9\tPSR2401714\t45644.630555555559\t45652\t完了\n",
215
+ ],
216
+ [
217
+ 3,
218
+ "障害恒久対応・改善対応",
219
+ "No\t案件番号\t分類\t開始日\t対応完了時期想定\n" +
220
+ "1\tPSR2401334\t改善対応\t45576.411805555559\t45688\n" +
221
+ "2\tPSR2401335\t改善対応\t45576.415277777778\t45688\n" +
222
+ "3\tPSR2401410\t改善対応\t45588.428472222222\t調整中\n" +
223
+ "4\tPSR2401411\t改善対応\t45588.432638888888\t調整中\n" +
224
+ "5\tPSR2401718\t障害恒久対応\t45645.386111111111\t45695\n" +
225
+ "6\tPSR2401807\t障害恒久対応\t45664.546527777777\t調整中\t1/16リリースで調整中\n",
226
+ ],
227
+ ],
228
+ actual)
229
+ end
230
+
156
231
  sub_test_case("invalid") do
157
232
  def test_empty
158
233
  messages = capture_log do
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: chupa-text
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.5
4
+ version: 1.3.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sutou Kouhei
8
8
  bindir: bin
9
9
  cert_chain: []
10
- date: 2024-09-22 00:00:00.000000000 Z
10
+ date: 2025-01-10 00:00:00.000000000 Z
11
11
  dependencies:
12
12
  - !ruby/object:Gem::Dependency
13
13
  name: archive-zip
@@ -189,6 +189,7 @@ files:
189
189
  - test/fixture/tar/top-level.tar
190
190
  - test/fixture/tar/utf-8.tar
191
191
  - test/fixture/xlsx/attributes.xlsx
192
+ - test/fixture/xlsx/complex-shared-strings.xlsx
192
193
  - test/fixture/xlsx/empty.xlsx
193
194
  - test/fixture/xlsx/multi-sheets.xlsx
194
195
  - test/fixture/xlsx/not-shared-cell.xlsx
@@ -232,7 +233,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
232
233
  - !ruby/object:Gem::Version
233
234
  version: '0'
234
235
  requirements: []
235
- rubygems_version: 3.6.0.dev
236
+ rubygems_version: 3.6.2
236
237
  specification_version: 4
237
238
  summary: ChupaText is an extensible text extractor. You can plug your custom text
238
239
  extractor in ChupaText. You can write your plugin by Ruby.