simple_text_extract 3.0.7 → 3.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: bad3e0fab883d324b24e9a06cacdceed53de65e51489be88557a3dcdc5bf39b6
4
- data.tar.gz: 2c8b53df878e404f90ca65eacae7ce25bc1663de7f138bed7c8ee744b13ee456
3
+ metadata.gz: b19c5d7467780a0ccb69fb18e29eb81c9acb7e16caf39a93ddeedaf325d541bd
4
+ data.tar.gz: 942c5a13853805d7e66feacf997e5a9181e69ea309a880916ca08653c0e3e8bd
5
5
  SHA512:
6
- metadata.gz: a2e1d6659ddfd9cf3252afc481db1077499ecfab0b4602b52555d6f5f18a29b7a7bbeb671009a0505be547a5ad553accbebf037df3f5a4aec1f9673fc4ff0069
7
- data.tar.gz: c3363629bf98d6fd55ac380830c2b7721903361a021e0ee10c5ddf33566be27199dda7e569cdc6821110a2200d3804d62f85d936a99165b58584df52c955afe1
6
+ metadata.gz: 42303612565dbfe9f23f699c3e554fb9509bbf7165e24d5b4b1fbea3f0ffe0bcd9f8723ac39861341f978b734ab671d17f6aa5a530e6fcf47dddfbe6e796acd1
7
+ data.tar.gz: c07deca30b24a70fef2baf7967bcfb411d4a60bdd24c624ce0a328e83be4ed558dca2628f468fa27b9d66e5e317413539d15d7b1bb551a8ec987d80eb69723a6
data/.rubocop.yml CHANGED
@@ -111,5 +111,8 @@ Lint/MissingSuper:
111
111
  Lint/ConstantDefinitionInBlock:
112
112
  Enabled: false
113
113
 
114
+ Lint/SuppressedException:
115
+ Enabled: false
116
+
114
117
  Style/SingleArgumentDig:
115
118
  Enabled: false
data/Gemfile.lock CHANGED
@@ -1,7 +1,9 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- simple_text_extract (3.0.7)
4
+ simple_text_extract (3.0.9)
5
+ base64
6
+ csv
5
7
  roo (~> 2.10.0)
6
8
  rubyzip (~> 2.3.2)
7
9
  spreadsheet (~> 1.3.0)
@@ -10,61 +12,62 @@ GEM
10
12
  remote: https://rubygems.org/
11
13
  specs:
12
14
  ast (2.4.2)
13
- bigdecimal (3.1.8)
15
+ base64 (0.2.0)
16
+ bigdecimal (3.1.9)
14
17
  coderay (1.1.3)
15
- json (2.7.2)
18
+ csv (3.3.2)
19
+ json (2.9.1)
16
20
  language_server-protocol (3.17.0.3)
17
- memory_profiler (1.0.2)
21
+ memory_profiler (1.1.0)
18
22
  method_source (1.1.0)
19
- minitest (5.24.1)
20
- mocha (2.4.0)
23
+ minitest (5.25.4)
24
+ mocha (2.7.1)
21
25
  ruby2_keywords (>= 0.0.5)
22
- nokogiri (1.16.6-arm64-darwin)
26
+ nokogiri (1.18.1-arm64-darwin)
23
27
  racc (~> 1.4)
24
- nokogiri (1.16.6-x86_64-linux)
28
+ nokogiri (1.18.1-x86_64-linux-gnu)
25
29
  racc (~> 1.4)
26
- parallel (1.25.1)
27
- parser (3.3.4.0)
30
+ parallel (1.26.3)
31
+ parser (3.3.6.0)
28
32
  ast (~> 2.4.1)
29
33
  racc
30
- pry (0.14.2)
34
+ pry (0.15.2)
31
35
  coderay (~> 1.1)
32
36
  method_source (~> 1.0)
33
- racc (1.8.0)
37
+ racc (1.8.1)
34
38
  rainbow (3.1.1)
35
39
  rake (13.2.1)
36
- regexp_parser (2.9.2)
37
- rexml (3.3.1)
38
- strscan
40
+ regexp_parser (2.10.0)
39
41
  roo (2.10.1)
40
42
  nokogiri (~> 1)
41
43
  rubyzip (>= 1.3.0, < 3.0.0)
42
- rubocop (1.65.0)
44
+ rubocop (1.69.2)
43
45
  json (~> 2.3)
44
46
  language_server-protocol (>= 3.17.0)
45
47
  parallel (~> 1.10)
46
48
  parser (>= 3.3.0.2)
47
49
  rainbow (>= 2.2.2, < 4.0)
48
- regexp_parser (>= 2.4, < 3.0)
49
- rexml (>= 3.2.5, < 4.0)
50
- rubocop-ast (>= 1.31.1, < 2.0)
50
+ regexp_parser (>= 2.9.3, < 3.0)
51
+ rubocop-ast (>= 1.36.2, < 2.0)
51
52
  ruby-progressbar (~> 1.7)
52
- unicode-display_width (>= 2.4.0, < 3.0)
53
- rubocop-ast (1.31.3)
53
+ unicode-display_width (>= 2.4.0, < 4.0)
54
+ rubocop-ast (1.37.0)
54
55
  parser (>= 3.3.1.0)
55
56
  ruby-ole (1.2.13.1)
56
57
  ruby-progressbar (1.13.0)
57
58
  ruby2_keywords (0.0.5)
58
59
  rubyzip (2.3.2)
59
- spreadsheet (1.3.1)
60
+ spreadsheet (1.3.3)
60
61
  bigdecimal
61
62
  ruby-ole
62
- strscan (3.1.0)
63
- unicode-display_width (2.5.0)
63
+ unicode-display_width (3.1.3)
64
+ unicode-emoji (~> 4.0, >= 4.0.4)
65
+ unicode-emoji (4.0.4)
64
66
 
65
67
  PLATFORMS
66
68
  arm64-darwin-21
67
69
  arm64-darwin-23
70
+ arm64-darwin-24
68
71
  x86_64-linux
69
72
 
70
73
  DEPENDENCIES
@@ -1,5 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require "open3"
4
+
3
5
  class SimpleTextExtract::Extract # rubocop:disable Metrics/ClassLength
4
6
  def self.formatter(path)
5
7
  case path
@@ -78,7 +80,13 @@ class SimpleTextExtract::Extract # rubocop:disable Metrics/ClassLength
78
80
  def pdf_extract
79
81
  return nil if SimpleTextExtract.missing_dependency?("pdftotext")
80
82
 
81
- `pdftotext #{Shellwords.escape(file.path)} - 2>/dev/null`
83
+ stdout, stderr, status = Open3.capture3("pdftotext #{Shellwords.escape(file.path)} -")
84
+ if status.success?
85
+ stdout
86
+ else
87
+ warn "pdftotext failed: #{stderr}"
88
+ nil
89
+ end
82
90
  end
83
91
 
84
92
  def xlsx_extract
@@ -87,13 +95,14 @@ class SimpleTextExtract::Extract # rubocop:disable Metrics/ClassLength
87
95
  spreadsheet = Roo::Spreadsheet.open(file, only_visible_sheets: true)
88
96
 
89
97
  text = []
90
-
91
98
  spreadsheet.sheets.each_with_index do |name, i|
92
99
  text << "# Sheet Index: #{i}"
93
100
  text << "# Sheet Name: #{name}"
94
101
 
95
102
  spreadsheet.sheet(name)&.each_row_streaming do |row|
96
- text << row.map(&:to_s).join(" ")
103
+ text << row.map do |cell|
104
+ cell.value.to_s
105
+ end.join(" ")
97
106
  end
98
107
  end
99
108
 
@@ -116,7 +125,13 @@ class SimpleTextExtract::Extract # rubocop:disable Metrics/ClassLength
116
125
  def doc_extract
117
126
  return nil if SimpleTextExtract.missing_dependency?("antiword")
118
127
 
119
- `antiword #{Shellwords.escape(file.path)}`
128
+ stdout, stderr, status = Open3.capture3("antiword #{Shellwords.escape(file.path)}")
129
+ if status.success?
130
+ stdout
131
+ else
132
+ warn "antiword failed: #{stderr}"
133
+ nil
134
+ end
120
135
  end
121
136
 
122
137
  def docx_extract
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module SimpleTextExtract
4
- VERSION = "3.0.7"
4
+ VERSION = "3.0.9"
5
5
  end
@@ -27,6 +27,8 @@ Gem::Specification.new do |spec|
27
27
  spec.requirements << "pdftotext/poppler"
28
28
 
29
29
  spec.add_dependency "roo", "~> 2.10.0"
30
- spec.add_dependency "spreadsheet", "~> 1.3.0"
31
30
  spec.add_dependency "rubyzip", "~> 2.3.2"
31
+ spec.add_dependency "spreadsheet", "~> 1.3.0"
32
+ spec.add_dependency "base64"
33
+ spec.add_dependency "csv"
32
34
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: simple_text_extract
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.0.7
4
+ version: 3.0.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nick Weiland
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-07-12 00:00:00.000000000 Z
11
+ date: 2024-12-31 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: roo
@@ -24,6 +24,20 @@ dependencies:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
26
  version: 2.10.0
27
+ - !ruby/object:Gem::Dependency
28
+ name: rubyzip
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: 2.3.2
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: 2.3.2
27
41
  - !ruby/object:Gem::Dependency
28
42
  name: spreadsheet
29
43
  requirement: !ruby/object:Gem::Requirement
@@ -39,19 +53,33 @@ dependencies:
39
53
  - !ruby/object:Gem::Version
40
54
  version: 1.3.0
41
55
  - !ruby/object:Gem::Dependency
42
- name: rubyzip
56
+ name: base64
43
57
  requirement: !ruby/object:Gem::Requirement
44
58
  requirements:
45
- - - "~>"
59
+ - - ">="
46
60
  - !ruby/object:Gem::Version
47
- version: 2.3.2
61
+ version: '0'
48
62
  type: :runtime
49
63
  prerelease: false
50
64
  version_requirements: !ruby/object:Gem::Requirement
51
65
  requirements:
52
- - - "~>"
66
+ - - ">="
53
67
  - !ruby/object:Gem::Version
54
- version: 2.3.2
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: csv
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
55
83
  description: Extract text from various file types before resorting to an OCR solution.
56
84
  email:
57
85
  - nickweiland@gmail.com
@@ -97,7 +125,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
97
125
  requirements:
98
126
  - antiword
99
127
  - pdftotext/poppler
100
- rubygems_version: 3.5.15
128
+ rubygems_version: 3.5.21
101
129
  signing_key:
102
130
  specification_version: 4
103
131
  summary: Extract text from various file types before resorting to an OCR solution.