wp2txt 1.0.1 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.dockerignore +8 -0
- data/.github/workflows/ci.yml +36 -0
- data/.gitignore +1 -1
- data/.rubocop.yml +80 -0
- data/.solargraph.yml +22 -0
- data/Dockerfile +20 -0
- data/Gemfile +9 -2
- data/README.md +42 -13
- data/Rakefile +25 -4
- data/bin/wp2txt +177 -182
- data/lib/wp2txt/article.rb +70 -92
- data/lib/wp2txt/regex.rb +93 -0
- data/lib/wp2txt/utils.rb +172 -282
- data/lib/wp2txt/version.rb +3 -1
- data/lib/wp2txt.rb +119 -150
- data/spec/spec_helper.rb +4 -4
- data/spec/utils_spec.rb +101 -124
- data/wp2txt.gemspec +16 -18
- metadata +60 -11
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 62f1e8d6ab1932f3ae3c34fb71930b7e73500c832481dcea6288742c38850a79
|
4
|
+
data.tar.gz: f0ff0a5488b635b828338d41029c5ad191a0c88282c0fa294a9facf2d93c055b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7bca85758e88d53dcd33fe43e83a251624f89329a2cb55ffb97b41141bcf8fe5ace7c48e3b8e49f5aa42f84724247cfe4ad376238a949e9154876d4d07469afe
|
7
|
+
data.tar.gz: de59399d5163afed2947e0802abf2e0365894d566c8a1f11823bc901d4948346e7af47d6fba558387f5af7e1301a6725a51a322ac1cd4810264dc3003e0729e2
|
data/.dockerignore
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
name: CI
|
2
|
+
|
3
|
+
on:
|
4
|
+
push:
|
5
|
+
pull_request:
|
6
|
+
workflow_dispatch:
|
7
|
+
schedule:
|
8
|
+
- cron: '42 5 * * *'
|
9
|
+
|
10
|
+
jobs:
|
11
|
+
test:
|
12
|
+
strategy:
|
13
|
+
fail-fast: false
|
14
|
+
matrix:
|
15
|
+
ruby: [ '3.1' ]
|
16
|
+
|
17
|
+
runs-on: ubuntu-latest
|
18
|
+
name: Ruby ${{matrix.ruby}}
|
19
|
+
container: ruby:${{matrix.ruby}}
|
20
|
+
|
21
|
+
steps:
|
22
|
+
- uses: actions/checkout@v3
|
23
|
+
|
24
|
+
- name: Show Ruby Version
|
25
|
+
run: ruby -v
|
26
|
+
|
27
|
+
- name: Install dependencies
|
28
|
+
run: bundle install
|
29
|
+
|
30
|
+
- name: Install rspec
|
31
|
+
run: gem install rspec
|
32
|
+
|
33
|
+
- name: Run tests
|
34
|
+
run: rspec
|
35
|
+
|
36
|
+
|
data/.gitignore
CHANGED
data/.rubocop.yml
ADDED
@@ -0,0 +1,80 @@
|
|
1
|
+
AllCops:
|
2
|
+
NewCops: disable
|
3
|
+
SuggestExtensions: false
|
4
|
+
TargetRubyVersion: 2.6
|
5
|
+
|
6
|
+
Documentation:
|
7
|
+
Enabled: false
|
8
|
+
|
9
|
+
Naming/AccessorMethodName:
|
10
|
+
Enabled: false
|
11
|
+
|
12
|
+
Naming/VariableNumber:
|
13
|
+
Enabled: false
|
14
|
+
|
15
|
+
Naming/FileName:
|
16
|
+
Enabled: false
|
17
|
+
|
18
|
+
Security/MarshalLoad:
|
19
|
+
Enabled: false
|
20
|
+
|
21
|
+
Security/Open:
|
22
|
+
Enabled: false
|
23
|
+
|
24
|
+
Layout/EndOfLine:
|
25
|
+
Enabled: False
|
26
|
+
|
27
|
+
Style/FormatStringToken:
|
28
|
+
Enabled: false
|
29
|
+
|
30
|
+
Style/ClassVars:
|
31
|
+
Enabled: false
|
32
|
+
|
33
|
+
Style/OptionalBooleanParameter:
|
34
|
+
Enabled: false
|
35
|
+
|
36
|
+
Style/StringConcatenation:
|
37
|
+
Enabled: false
|
38
|
+
|
39
|
+
Style/PerlBackrefs:
|
40
|
+
Enabled: false
|
41
|
+
|
42
|
+
Style/StringLiterals:
|
43
|
+
Enabled: false
|
44
|
+
|
45
|
+
Style/StringLiteralsInInterpolation:
|
46
|
+
Enabled: true
|
47
|
+
EnforcedStyle: double_quotes
|
48
|
+
|
49
|
+
Style/WordArray:
|
50
|
+
Enabled: false
|
51
|
+
|
52
|
+
Style/EvalWithLocation:
|
53
|
+
Enabled: false
|
54
|
+
|
55
|
+
Layout/LineLength:
|
56
|
+
Max: 400
|
57
|
+
|
58
|
+
Metrics/MethodLength:
|
59
|
+
Max: 200
|
60
|
+
|
61
|
+
Metrics/BlockLength:
|
62
|
+
Max: 200
|
63
|
+
|
64
|
+
Metrics/AbcSize:
|
65
|
+
Max: 200
|
66
|
+
|
67
|
+
Metrics/PerceivedComplexity:
|
68
|
+
Max: 60
|
69
|
+
|
70
|
+
Metrics/ClassLength:
|
71
|
+
Max: 800
|
72
|
+
|
73
|
+
Metrics/CyclomaticComplexity:
|
74
|
+
Max: 60
|
75
|
+
|
76
|
+
Metrics/ParameterLists:
|
77
|
+
Max: 8
|
78
|
+
|
79
|
+
Metrics/ModuleLength:
|
80
|
+
Max: 600
|
data/.solargraph.yml
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
---
|
2
|
+
include:
|
3
|
+
- "**/*.rb"
|
4
|
+
exclude:
|
5
|
+
- spec/**/*
|
6
|
+
- test/**/*
|
7
|
+
- vendor/**/*
|
8
|
+
- ".bundle/**/*"
|
9
|
+
require: []
|
10
|
+
domains: []
|
11
|
+
reporters:
|
12
|
+
- rubocop
|
13
|
+
# - require_not_found
|
14
|
+
formatter:
|
15
|
+
rubocop:
|
16
|
+
cops: safe
|
17
|
+
except: []
|
18
|
+
only: []
|
19
|
+
extra_args: []
|
20
|
+
require_paths: []
|
21
|
+
plugins: []
|
22
|
+
max_files: 5000
|
data/Dockerfile
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
FROM ruby:3.1.3-alpine3.17
|
2
|
+
|
3
|
+
WORKDIR /wp2txt
|
4
|
+
COPY . ./
|
5
|
+
RUN rm -Rf wp2txt/Gemfile.lock
|
6
|
+
|
7
|
+
RUN apk update && \
|
8
|
+
apk upgrade && \
|
9
|
+
apk add --no-cache linux-headers libxml2-dev make gcc libc-dev bash && \
|
10
|
+
apk add --no-cache -t .build-packages --no-cache build-base curl-dev wget gcompat && \
|
11
|
+
bundle install -j4
|
12
|
+
|
13
|
+
RUN wget https://fossies.org/linux/privat/lbzip2-2.5.tar.gz -O lbzip2.tar.gz && \
|
14
|
+
tar -xvf lbzip2.tar.gz && cd lbzip2-2.5 && \
|
15
|
+
bash configure && make && make install && \
|
16
|
+
cd .. && rm -rf lbzip2*
|
17
|
+
|
18
|
+
WORKDIR /
|
19
|
+
ENV PATH $PATH:/wp2txt/bin
|
20
|
+
CMD ["bash"]
|
data/Gemfile
CHANGED
@@ -1,4 +1,11 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
source "http://rubygems.org"
|
2
4
|
|
3
|
-
|
4
|
-
|
5
|
+
gem "htmlentities"
|
6
|
+
gem "nokogiri"
|
7
|
+
gem "optimist"
|
8
|
+
gem "parallel"
|
9
|
+
gem "pastel"
|
10
|
+
gem "ruby-progressbar"
|
11
|
+
gem "tty-spinner"
|
data/README.md
CHANGED
@@ -6,20 +6,30 @@ A command-line toolkit to extract text content and category data from Wikipedia
|
|
6
6
|
|
7
7
|
WP2TXT extracts text and category data from Wikipedia dump files (encoded in XML / compressed with Bzip2), removing MediaWiki markup and other metadata.
|
8
8
|
|
9
|
-
|
9
|
+
## Changelog
|
10
10
|
|
11
|
-
|
12
|
-
|
13
|
-
|
11
|
+
**December 2022**
|
12
|
+
|
13
|
+
- Docker images available via Docker Hub
|
14
|
+
|
15
|
+
**November 2022**
|
16
|
+
|
17
|
+
- Code added to suppress "Invalid byte sequence error" when an illegal UTF-8 character is input.
|
18
|
+
|
19
|
+
**August 2022**
|
20
|
+
|
21
|
+
- A new option `--category-only` has been added. When this option is enabled, only the title and category information of the article is extracted.
|
22
|
+
- A new option `--summary-only` has been added. If this option is enabled, only the title, category information, and opening paragraphs of the article will be extracted.
|
23
|
+
- Text conversion with the current version of WP2TXT is *more than 2x faster* than the previous version due to parallel processing of multiple files (the rate of speedup depends on the CPU cores used for processing).
|
14
24
|
|
15
25
|
## Screenshot
|
16
26
|
|
17
|
-
<img src='https://raw.githubusercontent.com/yohasebe/wp2txt/master/image/screenshot.png' width="
|
27
|
+
<img src='https://raw.githubusercontent.com/yohasebe/wp2txt/master/image/screenshot.png' width="800" />
|
18
28
|
|
19
|
-
**Environment**
|
29
|
+
**Environment**
|
20
30
|
|
21
31
|
- WP2TXT 1.0.1
|
22
|
-
- MacBook Pro (2021 Apple M1 Pro)
|
32
|
+
- MacBook Pro (2021 Apple M1 Pro)
|
23
33
|
- enwiki-20220720-pages-articles.xml.bz2 (19.98 GB)
|
24
34
|
|
25
35
|
In the above environment, the process (decompression, splitting, extraction, and conversion) to obtain the plain text data of the English Wikipedia takes less than 1.5 hours.
|
@@ -32,9 +42,28 @@ In the above environment, the process (decompression, splitting, extraction, and
|
|
32
42
|
- Allows extracting category information of the article
|
33
43
|
- Allows extracting opening paragraphs of the article
|
34
44
|
|
35
|
-
##
|
45
|
+
## Setting Up
|
46
|
+
|
47
|
+
### WP2TXT on Docker
|
48
|
+
|
49
|
+
1. Install [Docker Desktop](https://www.docker.com/products/docker-desktop/) (Mac/Windows/Linux)
|
50
|
+
2. Execute `docker` command in a terminal:
|
51
|
+
|
52
|
+
```shell
|
53
|
+
docker run -it -v /Users/me/localdata:/data yohasebe/wp2txt
|
54
|
+
```
|
55
|
+
|
56
|
+
- Make sure to replace `/Users/me/localdata` with the full path to the data directory on your local computer
|
57
|
+
|
58
|
+
3. The Docker image will begin downloading and a bash prompt will appear when finished.
|
59
|
+
4. The `wp2txt` command will be available anywhere in the Docker container. Use the `/data` directory as the location of the input dump files and the output text files.
|
60
|
+
|
61
|
+
**IMPORTANT:**
|
62
|
+
|
63
|
+
- Configure Docker Desktop resource settings (number of cores, amount of memory, etc.) to get the best performance possible.
|
64
|
+
- When running the `wp2txt` command inside a Docker container, be sure to set the output directory to somewhere in the mounted local directory specified by the `docker run` command.
|
36
65
|
|
37
|
-
###
|
66
|
+
### WP2TXT on MacOS and Linux
|
38
67
|
|
39
68
|
WP2TXT requires that one of the following commands be installed on the system in order to decompress `bz2` files:
|
40
69
|
|
@@ -48,7 +77,7 @@ If you are using MacOS with Homebrew installed, you can install `lbzip2` with th
|
|
48
77
|
|
49
78
|
$ brew install lbzip2
|
50
79
|
|
51
|
-
###
|
80
|
+
### WP2TXT on Windows
|
52
81
|
|
53
82
|
Install [Bzip2 for Windows](http://gnuwin32.sourceforge.net/packages/bzip2.htm) and set the path so that WP2TXT can use the bunzip2.exe command. Alternatively, you can extract the Wikipedia dump file in your own way and process the resulting XML file with WP2TXT.
|
54
83
|
|
@@ -184,11 +213,11 @@ The author will appreciate your mentioning one of these in your research.
|
|
184
213
|
Or use this BibTeX entry:
|
185
214
|
|
186
215
|
```
|
187
|
-
@misc{
|
216
|
+
@misc{wp2txt_2022,
|
188
217
|
author = {Yoichiro Hasebe},
|
189
218
|
title = {WP2TXT: A command-line toolkit to extract text content and category data from Wikipedia dump files},
|
190
|
-
url = {https://github.com/yohasebe/wp2txt}
|
191
|
-
year = {2022}
|
219
|
+
url = {https://github.com/yohasebe/wp2txt},
|
220
|
+
year = {2022}
|
192
221
|
}
|
193
222
|
```
|
194
223
|
|
data/Rakefile
CHANGED
@@ -1,9 +1,30 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require "bundler/gem_tasks"
|
2
|
-
require
|
3
|
-
require
|
4
|
+
require "rspec/core"
|
5
|
+
require "rspec/core/rake_task"
|
6
|
+
require_relative "./lib/wp2txt/version"
|
7
|
+
|
8
|
+
class String
|
9
|
+
def strip_heredoc
|
10
|
+
gsub(/^#{scan(/^[ \t]*(?=\S)/).min}/, "")
|
11
|
+
end
|
12
|
+
end
|
4
13
|
|
5
14
|
RSpec::Core::RakeTask.new(:spec) do |spec|
|
6
|
-
spec.pattern = FileList[
|
15
|
+
spec.pattern = FileList["spec/**/*_spec.rb"]
|
7
16
|
end
|
8
17
|
|
9
|
-
task :
|
18
|
+
task default: :spec
|
19
|
+
|
20
|
+
desc "Push Docker images"
|
21
|
+
task :push do
|
22
|
+
sh <<-SCRIPT.strip_heredoc, { verbose: false }
|
23
|
+
/bin/bash -xeu <<'BASH'
|
24
|
+
# docker buildx create --name mybuilder
|
25
|
+
# docker buildx use mybuilder
|
26
|
+
# docker buildx inspect --bootstrap
|
27
|
+
docker buildx build --platform linux/amd64,linux/arm64 -t yohasebe/wp2txt:#{Wp2txt::VERSION} -t yohasebe/wp2txt:latest . --push
|
28
|
+
BASH
|
29
|
+
SCRIPT
|
30
|
+
end
|