wp2txt 1.0.1 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d33a41cf46688679a14eb8c3eb16f6ed33ce9175c7f5b566c9f87998ba2c8401
4
- data.tar.gz: 7371e0f7b06b2f0846f01d66f461c7e106778adc6e686919302f0f29b1f80a9e
3
+ metadata.gz: 62f1e8d6ab1932f3ae3c34fb71930b7e73500c832481dcea6288742c38850a79
4
+ data.tar.gz: f0ff0a5488b635b828338d41029c5ad191a0c88282c0fa294a9facf2d93c055b
5
5
  SHA512:
6
- metadata.gz: cab8d9c27989387acc6dbbe052029d2205508ce10e38b8eedc111c822328d8eba551d603020684cbb3844a87b747f261a5959f711267acd96a3b97ccef4f6834
7
- data.tar.gz: 4de59be37d57ef3d14ae2304660e8dde069bdf645a7cff862026562b26327984f1be13840e9d6ec1f25110222367f71c84a0286b649d71fec0c13805c6b0a647
6
+ metadata.gz: 7bca85758e88d53dcd33fe43e83a251624f89329a2cb55ffb97b41141bcf8fe5ace7c48e3b8e49f5aa42f84724247cfe4ad376238a949e9154876d4d07469afe
7
+ data.tar.gz: de59399d5163afed2947e0802abf2e0365894d566c8a1f11823bc901d4948346e7af47d6fba558387f5af7e1301a6725a51a322ac1cd4810264dc3003e0729e2
data/.dockerignore ADDED
@@ -0,0 +1,8 @@
1
+ .git
2
+ .github
3
+ image
4
+ pkg
5
+ spec
6
+ .dockerignore
7
+ .gitignore
8
+ Gemfile.lock
@@ -0,0 +1,36 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ pull_request:
6
+ workflow_dispatch:
7
+ schedule:
8
+ - cron: '42 5 * * *'
9
+
10
+ jobs:
11
+ test:
12
+ strategy:
13
+ fail-fast: false
14
+ matrix:
15
+ ruby: [ '3.1' ]
16
+
17
+ runs-on: ubuntu-latest
18
+ name: Ruby ${{matrix.ruby}}
19
+ container: ruby:${{matrix.ruby}}
20
+
21
+ steps:
22
+ - uses: actions/checkout@v3
23
+
24
+ - name: Show Ruby Version
25
+ run: ruby -v
26
+
27
+ - name: Install dependencies
28
+ run: bundle install
29
+
30
+ - name: Install rspec
31
+ run: gem install rspec
32
+
33
+ - name: Run tests
34
+ run: rspec
35
+
36
+
data/.gitignore CHANGED
@@ -18,4 +18,4 @@ tmp
18
18
  .DS_Store
19
19
  *.bak
20
20
  *.~
21
-
21
+ tags
data/.rubocop.yml ADDED
@@ -0,0 +1,80 @@
1
+ AllCops:
2
+ NewCops: disable
3
+ SuggestExtensions: false
4
+ TargetRubyVersion: 2.6
5
+
6
+ Documentation:
7
+ Enabled: false
8
+
9
+ Naming/AccessorMethodName:
10
+ Enabled: false
11
+
12
+ Naming/VariableNumber:
13
+ Enabled: false
14
+
15
+ Naming/FileName:
16
+ Enabled: false
17
+
18
+ Security/MarshalLoad:
19
+ Enabled: false
20
+
21
+ Security/Open:
22
+ Enabled: false
23
+
24
+ Layout/EndOfLine:
25
+ Enabled: False
26
+
27
+ Style/FormatStringToken:
28
+ Enabled: false
29
+
30
+ Style/ClassVars:
31
+ Enabled: false
32
+
33
+ Style/OptionalBooleanParameter:
34
+ Enabled: false
35
+
36
+ Style/StringConcatenation:
37
+ Enabled: false
38
+
39
+ Style/PerlBackrefs:
40
+ Enabled: false
41
+
42
+ Style/StringLiterals:
43
+ Enabled: false
44
+
45
+ Style/StringLiteralsInInterpolation:
46
+ Enabled: true
47
+ EnforcedStyle: double_quotes
48
+
49
+ Style/WordArray:
50
+ Enabled: false
51
+
52
+ Style/EvalWithLocation:
53
+ Enabled: false
54
+
55
+ Layout/LineLength:
56
+ Max: 400
57
+
58
+ Metrics/MethodLength:
59
+ Max: 200
60
+
61
+ Metrics/BlockLength:
62
+ Max: 200
63
+
64
+ Metrics/AbcSize:
65
+ Max: 200
66
+
67
+ Metrics/PerceivedComplexity:
68
+ Max: 60
69
+
70
+ Metrics/ClassLength:
71
+ Max: 800
72
+
73
+ Metrics/CyclomaticComplexity:
74
+ Max: 60
75
+
76
+ Metrics/ParameterLists:
77
+ Max: 8
78
+
79
+ Metrics/ModuleLength:
80
+ Max: 600
data/.solargraph.yml ADDED
@@ -0,0 +1,22 @@
1
+ ---
2
+ include:
3
+ - "**/*.rb"
4
+ exclude:
5
+ - spec/**/*
6
+ - test/**/*
7
+ - vendor/**/*
8
+ - ".bundle/**/*"
9
+ require: []
10
+ domains: []
11
+ reporters:
12
+ - rubocop
13
+ # - require_not_found
14
+ formatter:
15
+ rubocop:
16
+ cops: safe
17
+ except: []
18
+ only: []
19
+ extra_args: []
20
+ require_paths: []
21
+ plugins: []
22
+ max_files: 5000
data/Dockerfile ADDED
@@ -0,0 +1,20 @@
1
+ FROM ruby:3.1.3-alpine3.17
2
+
3
+ WORKDIR /wp2txt
4
+ COPY . ./
5
+ RUN rm -Rf wp2txt/Gemfile.lock
6
+
7
+ RUN apk update && \
8
+ apk upgrade && \
9
+ apk add --no-cache linux-headers libxml2-dev make gcc libc-dev bash && \
10
+ apk add --no-cache -t .build-packages --no-cache build-base curl-dev wget gcompat && \
11
+ bundle install -j4
12
+
13
+ RUN wget https://fossies.org/linux/privat/lbzip2-2.5.tar.gz -O lbzip2.tar.gz && \
14
+ tar -xvf lbzip2.tar.gz && cd lbzip2-2.5 && \
15
+ bash configure && make && make install && \
16
+ cd .. && rm -rf lbzip2*
17
+
18
+ WORKDIR /
19
+ ENV PATH $PATH:/wp2txt/bin
20
+ CMD ["bash"]
data/Gemfile CHANGED
@@ -1,4 +1,11 @@
1
+ # frozen_string_literal: true
2
+
1
3
  source "http://rubygems.org"
2
4
 
3
- # Specify your gem's dependencies in wp2txt.gemspec
4
- gemspec
5
+ gem "htmlentities"
6
+ gem "nokogiri"
7
+ gem "optimist"
8
+ gem "parallel"
9
+ gem "pastel"
10
+ gem "ruby-progressbar"
11
+ gem "tty-spinner"
data/README.md CHANGED
@@ -6,20 +6,30 @@ A command-line toolkit to extract text content and category data from Wikipedia
6
6
 
7
7
  WP2TXT extracts text and category data from Wikipedia dump files (encoded in XML / compressed with Bzip2), removing MediaWiki markup and other metadata.
8
8
 
9
- **UPDATE (August 2022)**
9
+ ## Changelog
10
10
 
11
- 1. A new option `--category-only` has been added. When this option is enabled, only the title and category information of the article is extracted.
12
- 2. A new option `--summary-only` has been added. If this option is enabled, only the title, category information, and opening paragraphs of the article will be extracted.
13
- 3. Text conversion with the current version of WP2TXT is *more than 2x times faster* than the previous version due to parallel processing of multiple files (the rate of speedup depends on the CPU cores used for processing).
11
+ **December 2022**
12
+
13
+ - Docker images available via Docker Hub
14
+
15
+ **November 2022**
16
+
17
+ - Code added to suppress "Invalid byte sequence error" when an illegal UTF-8 character is input.
18
+
19
+ **August 2022**
20
+
21
+ - A new option `--category-only` has been added. When this option is enabled, only the title and category information of the article are extracted.
22
+ - A new option `--summary-only` has been added. If this option is enabled, only the title, category information, and opening paragraphs of the article will be extracted.
23
+ - Text conversion with the current version of WP2TXT is *more than 2x faster* than the previous version due to parallel processing of multiple files (the rate of speedup depends on the CPU cores used for processing).
14
24
 
15
25
  ## Screenshot
16
26
 
17
- <img src='https://raw.githubusercontent.com/yohasebe/wp2txt/master/image/screenshot.png' width="700" />
27
+ <img src='https://raw.githubusercontent.com/yohasebe/wp2txt/master/image/screenshot.png' width="800" />
18
28
 
19
- **Environment**
29
+ **Environment**
20
30
 
21
31
  - WP2TXT 1.0.1
22
- - MacBook Pro (2021 Apple M1 Pro)
32
+ - MacBook Pro (2021 Apple M1 Pro)
23
33
  - enwiki-20220720-pages-articles.xml.bz2 (19.98 GB)
24
34
 
25
35
  In the above environment, the process (decompression, splitting, extraction, and conversion) to obtain the plain text data of the English Wikipedia takes less than 1.5 hours.
@@ -32,9 +42,28 @@ In the above environment, the process (decompression, splitting, extraction, and
32
42
  - Allows extracting category information of the article
33
43
  - Allows extracting opening paragraphs of the article
34
44
 
35
- ## Preparation
45
+ ## Setting Up
46
+
47
+ ### WP2TXT on Docker
48
+
49
+ 1. Install [Docker Desktop](https://www.docker.com/products/docker-desktop/) (Mac/Windows/Linux)
50
+ 2. Execute the following `docker` command in a terminal:
51
+
52
+ ```shell
53
+ docker run -it -v /Users/me/localdata:/data yohasebe/wp2txt
54
+ ```
55
+
56
+ - Make sure to replace `/Users/me/localdata` with the full path to the data directory on your local computer
57
+
58
+ 3. The Docker image will begin downloading and a bash prompt will appear when finished.
59
+ 4. The `wp2txt` command will be available anywhere in the Docker container. Use the `/data` directory as the location of the input dump files and the output text files.
60
+
61
+ **IMPORTANT:**
62
+
63
+ - Configure Docker Desktop resource settings (number of cores, amount of memory, etc.) to get the best performance possible.
64
+ - When running the `wp2txt` command inside a Docker container, be sure to set the output directory to somewhere in the mounted local directory specified by the `docker run` command.
36
65
 
37
- ### For MacOS / Linux/ WSL2
66
+ ### WP2TXT on MacOS and Linux
38
67
 
39
68
  WP2TXT requires that one of the following commands be installed on the system in order to decompress `bz2` files:
40
69
 
@@ -48,7 +77,7 @@ If you are using MacOS with Homebrew installed, you can install `lbzip2` with th
48
77
 
49
78
  $ brew install lbzip2
50
79
 
51
- ### For Windows
80
+ ### WP2TXT on Windows
52
81
 
53
82
  Install [Bzip2 for Windows](http://gnuwin32.sourceforge.net/packages/bzip2.htm) and set the path so that WP2TXT can use the bunzip2.exe command. Alternatively, you can extract the Wikipedia dump file in your own way and process the resulting XML file with WP2TXT.
54
83
 
@@ -184,11 +213,11 @@ The author will appreciate your mentioning one of these in your research.
184
213
  Or use this BibTeX entry:
185
214
 
186
215
  ```
187
- @misc{WP2TXT_2022,
216
+ @misc{wp2txt_2022,
188
217
  author = {Yoichiro Hasebe},
189
218
  title = {WP2TXT: A command-line toolkit to extract text content and category data from Wikipedia dump files},
190
- url = {https://github.com/yohasebe/wp2txt}
191
- year = {2022},
219
+ url = {https://github.com/yohasebe/wp2txt},
220
+ year = {2022}
192
221
  }
193
222
  ```
194
223
 
data/Rakefile CHANGED
@@ -1,9 +1,30 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require "bundler/gem_tasks"
2
- require 'rspec/core'
3
- require 'rspec/core/rake_task'
4
+ require "rspec/core"
5
+ require "rspec/core/rake_task"
6
+ require_relative "./lib/wp2txt/version"
7
+
8
+ class String
9
+ def strip_heredoc
10
+ gsub(/^#{scan(/^[ \t]*(?=\S)/).min}/, "")
11
+ end
12
+ end
4
13
 
5
14
  RSpec::Core::RakeTask.new(:spec) do |spec|
6
- spec.pattern = FileList['spec/**/*_spec.rb']
15
+ spec.pattern = FileList["spec/**/*_spec.rb"]
7
16
  end
8
17
 
9
- task :default => :spec
18
+ task default: :spec
19
+
20
+ desc "Push Docker images"
21
+ task :push do
22
+ sh <<-SCRIPT.strip_heredoc, { verbose: false }
23
+ /bin/bash -xeu <<'BASH'
24
+ # docker buildx create --name mybuilder
25
+ # docker buildx use mybuilder
26
+ # docker buildx inspect --bootstrap
27
+ docker buildx build --platform linux/amd64,linux/arm64 -t yohasebe/wp2txt:#{Wp2txt::VERSION} -t yohasebe/wp2txt:latest . --push
28
+ BASH
29
+ SCRIPT
30
+ end