simple_text_extract 3.0.8 → 3.0.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 44412a154f56b1100983582f15f3d56800edf040dfac98a5ae9fb9b3fa2d6401
4
- data.tar.gz: a6c5b2d94e13be12c71d3b4a66d0c2aef8fc6b7d7084a4c2b2d490e49954916d
3
+ metadata.gz: e1d20d6e5aa53a4953677a83fdf8bae7f7e3f7e5c8d6fc9ca04ad9ff042ab271
4
+ data.tar.gz: 5dda39b4e17088d3a4f46b48e503ff3b1332dbc80719a7e6ae1a8db8e958df7c
5
5
  SHA512:
6
- metadata.gz: 88e630c8aeee64240196e96c93675ba15a1a7c2f3c2c1a67ab51a05d35dc404a11574f61995628b2b399cd4d42570d40beddb55c0b6cda0bccece97415d995a7
7
- data.tar.gz: 42ff8b4a4702c52702ac95b2bab6b79495384cdec6d9f68011fb04bfca778cf91442c010ab67ac0a1ee59d27148b00be43221fbd20392d6e96f53d4ec5ae7788
6
+ metadata.gz: 7e1b6c45cce0221573f8eced0701388d33cf5cce20c72d8596381e23a52aa7807a912e03ada9d08dadbb97a3655922f5b5def8bb2d857cffbbc7ba158808a671
7
+ data.tar.gz: 165580f7acb175b62be523d361ae8575492510636562b612b4f7b9358d36176f397df2e558058f07338fc07dba360449a8c7bd88465f6c80d73a4204e8b68164
@@ -12,6 +12,8 @@ jobs:
12
12
  fail-fast: false
13
13
  matrix:
14
14
  include:
15
+ - ruby: 3.4
16
+ gemfile: Gemfile
15
17
  - ruby: 3.3
16
18
  gemfile: Gemfile
17
19
  - ruby: 3.2
data/.ruby-version CHANGED
@@ -1 +1 @@
1
- 3.3.4
1
+ 3.4.2
data/Gemfile.lock CHANGED
@@ -1,70 +1,79 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- simple_text_extract (3.0.8)
5
- roo (~> 2.10.0)
6
- rubyzip (~> 2.3.2)
7
- spreadsheet (~> 1.3.0)
4
+ simple_text_extract (3.0.10)
5
+ base64
6
+ csv
7
+ roo (~> 2.10)
8
+ rubyzip (~> 2.3)
9
+ spreadsheet (~> 1.3)
8
10
 
9
11
  GEM
10
12
  remote: https://rubygems.org/
11
13
  specs:
12
- ast (2.4.2)
13
- bigdecimal (3.1.8)
14
+ ast (2.4.3)
15
+ base64 (0.2.0)
16
+ bigdecimal (3.1.9)
14
17
  coderay (1.1.3)
15
- json (2.7.2)
16
- language_server-protocol (3.17.0.3)
17
- memory_profiler (1.0.2)
18
+ csv (3.3.4)
19
+ json (2.10.2)
20
+ language_server-protocol (3.17.0.4)
21
+ lint_roller (1.1.0)
22
+ logger (1.7.0)
23
+ memory_profiler (1.1.0)
18
24
  method_source (1.1.0)
19
- minitest (5.24.1)
20
- mocha (2.4.0)
25
+ minitest (5.25.5)
26
+ mocha (2.7.1)
21
27
  ruby2_keywords (>= 0.0.5)
22
- nokogiri (1.16.6-arm64-darwin)
28
+ nokogiri (1.18.8-arm64-darwin)
23
29
  racc (~> 1.4)
24
- nokogiri (1.16.6-x86_64-linux)
30
+ nokogiri (1.18.8-x86_64-linux-gnu)
25
31
  racc (~> 1.4)
26
- parallel (1.25.1)
27
- parser (3.3.4.0)
32
+ parallel (1.27.0)
33
+ parser (3.3.8.0)
28
34
  ast (~> 2.4.1)
29
35
  racc
30
- pry (0.14.2)
36
+ prism (1.4.0)
37
+ pry (0.15.2)
31
38
  coderay (~> 1.1)
32
39
  method_source (~> 1.0)
33
- racc (1.8.0)
40
+ racc (1.8.1)
34
41
  rainbow (3.1.1)
35
42
  rake (13.2.1)
36
- regexp_parser (2.9.2)
37
- rexml (3.3.1)
38
- strscan
43
+ regexp_parser (2.10.0)
39
44
  roo (2.10.1)
40
45
  nokogiri (~> 1)
41
46
  rubyzip (>= 1.3.0, < 3.0.0)
42
- rubocop (1.65.0)
47
+ rubocop (1.75.2)
43
48
  json (~> 2.3)
44
- language_server-protocol (>= 3.17.0)
49
+ language_server-protocol (~> 3.17.0.2)
50
+ lint_roller (~> 1.1.0)
45
51
  parallel (~> 1.10)
46
52
  parser (>= 3.3.0.2)
47
53
  rainbow (>= 2.2.2, < 4.0)
48
- regexp_parser (>= 2.4, < 3.0)
49
- rexml (>= 3.2.5, < 4.0)
50
- rubocop-ast (>= 1.31.1, < 2.0)
54
+ regexp_parser (>= 2.9.3, < 3.0)
55
+ rubocop-ast (>= 1.44.0, < 2.0)
51
56
  ruby-progressbar (~> 1.7)
52
- unicode-display_width (>= 2.4.0, < 3.0)
53
- rubocop-ast (1.31.3)
54
- parser (>= 3.3.1.0)
57
+ unicode-display_width (>= 2.4.0, < 4.0)
58
+ rubocop-ast (1.44.1)
59
+ parser (>= 3.3.7.2)
60
+ prism (~> 1.4)
55
61
  ruby-ole (1.2.13.1)
56
62
  ruby-progressbar (1.13.0)
57
63
  ruby2_keywords (0.0.5)
58
- rubyzip (2.3.2)
59
- spreadsheet (1.3.1)
64
+ rubyzip (2.4.1)
65
+ spreadsheet (1.3.4)
60
66
  bigdecimal
67
+ logger
61
68
  ruby-ole
62
- strscan (3.1.0)
63
- unicode-display_width (2.5.0)
69
+ unicode-display_width (3.1.4)
70
+ unicode-emoji (~> 4.0, >= 4.0.4)
71
+ unicode-emoji (4.0.4)
64
72
 
65
73
  PLATFORMS
66
74
  arm64-darwin-21
67
75
  arm64-darwin-23
76
+ arm64-darwin-24
68
77
  x86_64-linux
69
78
 
70
79
  DEPENDENCIES
@@ -1,5 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require "open3"
4
+
3
5
  class SimpleTextExtract::Extract # rubocop:disable Metrics/ClassLength
4
6
  def self.formatter(path)
5
7
  case path
@@ -78,7 +80,13 @@ class SimpleTextExtract::Extract # rubocop:disable Metrics/ClassLength
78
80
  def pdf_extract
79
81
  return nil if SimpleTextExtract.missing_dependency?("pdftotext")
80
82
 
81
- `pdftotext #{Shellwords.escape(file.path)} - 2>/dev/null`
83
+ stdout, stderr, status = Open3.capture3("pdftotext #{Shellwords.escape(file.path)} -")
84
+ if status.success?
85
+ stdout
86
+ else
87
+ warn "pdftotext failed: #{stderr}"
88
+ nil
89
+ end
82
90
  end
83
91
 
84
92
  def xlsx_extract
@@ -117,7 +125,13 @@ class SimpleTextExtract::Extract # rubocop:disable Metrics/ClassLength
117
125
  def doc_extract
118
126
  return nil if SimpleTextExtract.missing_dependency?("antiword")
119
127
 
120
- `antiword #{Shellwords.escape(file.path)}`
128
+ stdout, stderr, status = Open3.capture3("antiword #{Shellwords.escape(file.path)}")
129
+ if status.success?
130
+ stdout
131
+ else
132
+ warn "antiword failed: #{stderr}"
133
+ nil
134
+ end
121
135
  end
122
136
 
123
137
  def docx_extract
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module SimpleTextExtract
4
- VERSION = "3.0.8"
4
+ VERSION = "3.0.10"
5
5
  end
@@ -26,7 +26,9 @@ Gem::Specification.new do |spec|
26
26
  spec.requirements << "antiword"
27
27
  spec.requirements << "pdftotext/poppler"
28
28
 
29
- spec.add_dependency "roo", "~> 2.10.0"
30
- spec.add_dependency "rubyzip", "~> 2.3.2"
31
- spec.add_dependency "spreadsheet", "~> 1.3.0"
29
+ spec.add_dependency "roo", "~> 2.10"
30
+ spec.add_dependency "rubyzip", "~> 2.3"
31
+ spec.add_dependency "spreadsheet", "~> 1.3"
32
+ spec.add_dependency "base64"
33
+ spec.add_dependency "csv"
32
34
  end
metadata CHANGED
@@ -1,14 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: simple_text_extract
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.0.8
4
+ version: 3.0.10
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nick Weiland
8
- autorequire:
9
8
  bindir: exe
10
9
  cert_chain: []
11
- date: 2024-07-12 00:00:00.000000000 Z
10
+ date: 2025-04-22 00:00:00.000000000 Z
12
11
  dependencies:
13
12
  - !ruby/object:Gem::Dependency
14
13
  name: roo
@@ -16,42 +15,70 @@ dependencies:
16
15
  requirements:
17
16
  - - "~>"
18
17
  - !ruby/object:Gem::Version
19
- version: 2.10.0
18
+ version: '2.10'
20
19
  type: :runtime
21
20
  prerelease: false
22
21
  version_requirements: !ruby/object:Gem::Requirement
23
22
  requirements:
24
23
  - - "~>"
25
24
  - !ruby/object:Gem::Version
26
- version: 2.10.0
25
+ version: '2.10'
27
26
  - !ruby/object:Gem::Dependency
28
27
  name: rubyzip
29
28
  requirement: !ruby/object:Gem::Requirement
30
29
  requirements:
31
30
  - - "~>"
32
31
  - !ruby/object:Gem::Version
33
- version: 2.3.2
32
+ version: '2.3'
34
33
  type: :runtime
35
34
  prerelease: false
36
35
  version_requirements: !ruby/object:Gem::Requirement
37
36
  requirements:
38
37
  - - "~>"
39
38
  - !ruby/object:Gem::Version
40
- version: 2.3.2
39
+ version: '2.3'
41
40
  - !ruby/object:Gem::Dependency
42
41
  name: spreadsheet
43
42
  requirement: !ruby/object:Gem::Requirement
44
43
  requirements:
45
44
  - - "~>"
46
45
  - !ruby/object:Gem::Version
47
- version: 1.3.0
46
+ version: '1.3'
48
47
  type: :runtime
49
48
  prerelease: false
50
49
  version_requirements: !ruby/object:Gem::Requirement
51
50
  requirements:
52
51
  - - "~>"
53
52
  - !ruby/object:Gem::Version
54
- version: 1.3.0
53
+ version: '1.3'
54
+ - !ruby/object:Gem::Dependency
55
+ name: base64
56
+ requirement: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - ">="
59
+ - !ruby/object:Gem::Version
60
+ version: '0'
61
+ type: :runtime
62
+ prerelease: false
63
+ version_requirements: !ruby/object:Gem::Requirement
64
+ requirements:
65
+ - - ">="
66
+ - !ruby/object:Gem::Version
67
+ version: '0'
68
+ - !ruby/object:Gem::Dependency
69
+ name: csv
70
+ requirement: !ruby/object:Gem::Requirement
71
+ requirements:
72
+ - - ">="
73
+ - !ruby/object:Gem::Version
74
+ version: '0'
75
+ type: :runtime
76
+ prerelease: false
77
+ version_requirements: !ruby/object:Gem::Requirement
78
+ requirements:
79
+ - - ">="
80
+ - !ruby/object:Gem::Version
81
+ version: '0'
55
82
  description: Extract text from various file types before resorting to an OCR solution.
56
83
  email:
57
84
  - nickweiland@gmail.com
@@ -59,7 +86,6 @@ executables: []
59
86
  extensions: []
60
87
  extra_rdoc_files: []
61
88
  files:
62
- - "-"
63
89
  - ".github/workflows/test.yml"
64
90
  - ".gitignore"
65
91
  - ".rubocop.yml"
@@ -80,7 +106,6 @@ homepage: https://github.com/weilandia/simple_text_extract
80
106
  licenses:
81
107
  - MIT
82
108
  metadata: {}
83
- post_install_message:
84
109
  rdoc_options: []
85
110
  require_paths:
86
111
  - lib
@@ -97,8 +122,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
97
122
  requirements:
98
123
  - antiword
99
124
  - pdftotext/poppler
100
- rubygems_version: 3.5.15
101
- signing_key:
125
+ rubygems_version: 3.6.2
102
126
  specification_version: 4
103
127
  summary: Extract text from various file types before resorting to an OCR solution.
104
128
  test_files: []
data/- DELETED
@@ -1 +0,0 @@
1
- Test