act_as_page_extractor 0.6.1 → 0.6.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +80 -71
- data/act_as_page_extractor.gemspec +1 -1
- data/lib/act_as_page_extractor/modules/tools.rb +4 -4
- data/lib/act_as_page_extractor/version.rb +1 -1
- data/spec/spec_helper.rb +6 -2
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d0bd64f8e12d0c7bb3a75893738e30af616e4bcc5b958b18853b35363823b5ef
|
4
|
+
data.tar.gz: d87505b025bd924a545e2f6cbd9071958d65f993a234092f65b3cb7e108b16b1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: af0708407f3b4546424e1666926c248cdb9fe0813ede2dd642d099836282d2f608d8d47edcd5cd513cef9b3ead231c192f6b815ec7721eb141b6820f561d0f30
|
7
|
+
data.tar.gz: 6a5969118ff6a6141aaaec8989e38670f75a817afa230636ed84f9f2a4e7c1f160ee569664f940e1355cfc74dff56e30370ca993fbb38d2a7139c17f56858acf
|
data/README.md
CHANGED
@@ -7,102 +7,111 @@ Library for extracting plain text from documents(files) for further processing (
|
|
7
7
|
|
8
8
|
Install appropriate tools before using:
|
9
9
|
|
10
|
-
|
11
|
-
|
10
|
+
```sh
|
11
|
+
sudo apt-get install zlib1g zlib1g-dev zip rar p7zip-full
|
12
|
+
```
|
12
13
|
Add this line to your application's Gemfile:
|
13
14
|
|
14
|
-
|
15
|
-
|
15
|
+
```rb
|
16
|
+
gem 'act_as_page_extractor'
|
17
|
+
bundle
|
18
|
+
```
|
16
19
|
## Usage
|
17
20
|
|
18
|
-
For example, for model Document we need
|
21
|
+
For example, for model Document in the Rails framework we need to run:
|
19
22
|
|
20
|
-
|
21
|
-
|
23
|
+
```sh
|
24
|
+
rails g act_as_page_extractor:migration Document category_id user_id
|
25
|
+
```
|
22
26
|
|
23
27
|
As a result we get two migration files:
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
end
|
45
|
-
|
46
|
-
add_index :extracted_pages, :document_id
|
47
|
-
add_index :extracted_pages, :category_id
|
48
|
-
add_index :extracted_pages, [:document_id, :category_id]
|
49
|
-
add_index :extracted_pages, [:document_id, :page_number]
|
50
|
-
end
|
28
|
+
```rb
|
29
|
+
class AddPageExtractorFields < ActiveRecord::Migration
|
30
|
+
def change
|
31
|
+
add_column :documents, :page_extraction_state, :string, default: ''
|
32
|
+
add_column :documents, :page_extraction_pages, :integer, default: 0
|
33
|
+
add_column :documents, :page_extraction_doctype, :string, default: ''
|
34
|
+
add_column :documents, :page_extraction_filesize, :string, default: ''
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
class CreateExtractedPages < ActiveRecord::Migration
|
39
|
+
def change
|
40
|
+
create_table :extracted_pages do |t|
|
41
|
+
t.text :page
|
42
|
+
t.integer :document_id
|
43
|
+
t.integer :category_id
|
44
|
+
t.integer :user_id
|
45
|
+
t.integer :page_number
|
46
|
+
|
47
|
+
t.timestamps null: false
|
51
48
|
end
|
52
49
|
|
50
|
+
add_index :extracted_pages, :document_id
|
51
|
+
add_index :extracted_pages, :category_id
|
52
|
+
add_index :extracted_pages, [:document_id, :category_id]
|
53
|
+
add_index :extracted_pages, [:document_id, :page_number]
|
54
|
+
end
|
55
|
+
end
|
56
|
+
```
|
53
57
|
|
54
|
-
Model Document must have field which contains path to file(supports [different archive types](https://github.com/phlowerteam/total_compressor) that contains [txt, pdf, doc/x, txt, html, rtf, ...](https://
|
58
|
+
The Document model must have a field that contains the path to a file (supports [different archive types](https://github.com/phlowerteam/total_compressor) that contain [txt, pdf, doc/x, txt, html, rtf, ...](https://docs-old.exoplatform.org/public/index.jsp?topic=%2FPLF41%2FPLFAdminGuide.Configuration.JODConverter.html))
|
55
59
|
|
56
60
|
Add the following initialization parameters to the model:
|
57
61
|
|
58
|
-
|
59
|
-
|
62
|
+
```rb
|
63
|
+
class Document < ActiveRecord::Base
|
64
|
+
include ActAsPageExtractor
|
60
65
|
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
66
|
+
act_as_page_extractor options: {
|
67
|
+
document_class: 'Document',
|
68
|
+
save_as_pdf: true,
|
69
|
+
filename: :filename,
|
70
|
+
document_id: :document_id,
|
71
|
+
additional_fields: [:category_id, :user_id],
|
72
|
+
#file_storage: "/full/path/to/documents/storage",
|
73
|
+
#pdf_storage: "/full/path/to/extracted/pdf/storage"
|
74
|
+
}
|
70
75
|
|
71
|
-
|
72
|
-
|
76
|
+
has_many :extracted_pages, dependent: :destroy
|
77
|
+
end
|
78
|
+
```
|
73
79
|
|
74
80
|
Now our instance has a few new methods:
|
75
81
|
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
82
|
+
```rb
|
83
|
+
document = Document.first
|
84
|
+
document.page_extract!
|
85
|
+
document.extracted_pages
|
86
|
+
document.pdf_path # if option 'save_as_pdf' is 'true'
|
80
87
|
|
81
|
-
|
82
|
-
|
88
|
+
# Access to pages
|
89
|
+
ExtractedPage.count
|
83
90
|
|
84
|
-
|
85
|
-
|
91
|
+
# Importing whole directory of documents
|
92
|
+
ActAsPageExtractor.import_files('/path/to/folder/with/documents')
|
86
93
|
|
87
|
-
|
88
|
-
|
94
|
+
# We can use cron to run the processing of all the new documents
|
95
|
+
ActAsPageExtractor.start_extraction
|
89
96
|
|
90
|
-
|
91
|
-
|
97
|
+
# Getting statistics information of all documents
|
98
|
+
ActAsPageExtractor.statistics
|
99
|
+
```
|
92
100
|
|
93
|
-
Parameters of initializing
|
101
|
+
Parameters of initializing **act_as_page_extractor**:
|
94
102
|
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
103
|
+
* **document_class** - name of model (e.g. Document)
|
104
|
+
* **save_as_pdf** - boolean [true, false]; set to true when we want to save the temporary pdf
|
105
|
+
* **filename** - name of field which contains access to the file and it should be an object with 'url' method that returns path to file (e.g. CarrierWave object with 'filename.url')
|
106
|
+
* **document_id** - name for saving id
|
107
|
+
* **additional_fields** - additional fields that are added to each extracted page (e.g. for indexing, etc.)
|
108
|
+
* **file_storage** - path for saving tmp files (by default it is "public")
|
109
|
+
* **pdf_storage** - path for saving pdf (by default it is "public/uploads/extracted/pdf")
|
102
110
|
|
103
111
|
## Run tests
|
104
|
-
|
105
|
-
|
112
|
+
```sh
|
113
|
+
rspec
|
114
|
+
```
|
106
115
|
## Contributing
|
107
116
|
1. Fork it
|
108
117
|
2. Create your feature branch (`git checkout -b my-new-feature`)
|
@@ -115,5 +124,5 @@ https://github.com/phlowerteam
|
|
115
124
|
phlowerteam@gmail.com
|
116
125
|
|
117
126
|
## License
|
118
|
-
Copyright (c)
|
127
|
+
Copyright (c) 2024 PhlowerTeam
|
119
128
|
MIT License
|
@@ -10,7 +10,7 @@ Gem::Specification.new do |spec|
|
|
10
10
|
spec.email = ['phlowerteam@gmail.com']
|
11
11
|
spec.description = %q{Library (Docsplit wrapper) for text extraction from pdf, doc/x, txt files with OpenOffice}
|
12
12
|
spec.summary = %q{Uses system calls}
|
13
|
-
spec.homepage = 'https://github.com/phlowerteam'
|
13
|
+
spec.homepage = 'https://github.com/phlowerteam/act_as_page_extractor'
|
14
14
|
spec.license = 'MIT'
|
15
15
|
|
16
16
|
spec.files = `git ls-files`.split($/)
|
@@ -1,17 +1,17 @@
|
|
1
1
|
require 'timeout'
|
2
2
|
|
3
3
|
module ActAsPageExtractor
|
4
|
+
# :nocov:
|
4
5
|
def timeout_wrapper
|
5
6
|
result = nil
|
6
7
|
begin
|
7
8
|
result = Timeout::timeout(60*5) { yield }
|
8
9
|
rescue
|
9
|
-
# :nocov:
|
10
10
|
ensure
|
11
|
-
# :nocov:
|
12
11
|
result
|
13
12
|
end
|
14
13
|
end
|
14
|
+
# :nocov:
|
15
15
|
|
16
16
|
def is_extracted
|
17
17
|
@pdf_pages.to_i > 0 && self.extracted_pages.count == @pdf_pages
|
@@ -46,11 +46,11 @@ module ActAsPageExtractor
|
|
46
46
|
# ap "@copy_document_path"
|
47
47
|
# ap @copy_document_path
|
48
48
|
# ap "@document_path"
|
49
|
-
|
49
|
+
# ap @document_path
|
50
50
|
# ap "@pdf_path"
|
51
51
|
# ap @pdf_path
|
52
52
|
# ap "@pdf_pages"
|
53
|
-
|
53
|
+
# ap @pdf_pages
|
54
54
|
end
|
55
55
|
# :nocov:
|
56
56
|
end
|
data/spec/spec_helper.rb
CHANGED
@@ -1,7 +1,11 @@
|
|
1
|
-
|
1
|
+
unless ENV['SKIP_COVERAGE']
|
2
2
|
require 'simplecov'
|
3
|
-
SimpleCov.start 'rails'
|
3
|
+
SimpleCov.start 'rails' do
|
4
|
+
add_filter 'vendor'
|
5
|
+
end
|
6
|
+
SimpleCov.minimum_coverage 100
|
4
7
|
end
|
8
|
+
|
5
9
|
require 'rspec'
|
6
10
|
require 'support/models'
|
7
11
|
require 'act_as_page_extractor'
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: act_as_page_extractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.6.1
|
4
|
+
version: 0.6.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- PhlowerTeam
|
@@ -249,7 +249,7 @@ files:
|
|
249
249
|
- test/test-doc-3-pages.rtf
|
250
250
|
- test/test-doc-3-pages.txt
|
251
251
|
- test/test-doc-3-pages.wrong
|
252
|
-
homepage: https://github.com/phlowerteam
|
252
|
+
homepage: https://github.com/phlowerteam/act_as_page_extractor
|
253
253
|
licenses:
|
254
254
|
- MIT
|
255
255
|
metadata: {}
|