wp2txt 1.0.2 → 1.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.dockerignore +8 -0
- data/.github/workflows/ci.yml +36 -0
- data/.gitignore +1 -1
- data/.rubocop.yml +80 -0
- data/.solargraph.yml +22 -0
- data/Dockerfile +20 -0
- data/Gemfile +9 -2
- data/README.md +34 -6
- data/Rakefile +25 -4
- data/bin/wp2txt +177 -182
- data/lib/wp2txt/article.rb +70 -92
- data/lib/wp2txt/regex.rb +93 -0
- data/lib/wp2txt/utils.rb +159 -270
- data/lib/wp2txt/version.rb +3 -1
- data/lib/wp2txt.rb +129 -155
- data/spec/spec_helper.rb +4 -4
- data/spec/utils_spec.rb +101 -124
- data/wp2txt.gemspec +16 -18
- metadata +60 -12
- data/tags +0 -58
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0bcba2c84286504ae628176aad55dbdea05889dfaa7f471cf080ae933691cffc
|
4
|
+
data.tar.gz: f83c63d7c6e91270da1da2aed54ab6e5c352c5695340ccd6378fdd20c43fc332
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ae0eae028a98d4299a0e93278220b991e53e13deb80b88cce2971cd889d769808305e0d3aa8ee4e73af0cc55f07f27c0cc6a9f0d440e4693a410ba7d0a6333ba
|
7
|
+
data.tar.gz: 25bff247bf80b4a0b5ff785ed51b60e3e21e6a3ca5e0bfeace9961df5060832ae047c3fa606a3267f9f770c253fd816176372c4d2f7ba73999d5317ea59933e6
|
data/.dockerignore
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
name: CI
|
2
|
+
|
3
|
+
on:
|
4
|
+
push:
|
5
|
+
pull_request:
|
6
|
+
workflow_dispatch:
|
7
|
+
schedule:
|
8
|
+
- cron: '42 5 * * *'
|
9
|
+
|
10
|
+
jobs:
|
11
|
+
test:
|
12
|
+
strategy:
|
13
|
+
fail-fast: false
|
14
|
+
matrix:
|
15
|
+
ruby: [ '3.1' ]
|
16
|
+
|
17
|
+
runs-on: ubuntu-latest
|
18
|
+
name: Ruby ${{matrix.ruby}}
|
19
|
+
container: ruby:${{matrix.ruby}}
|
20
|
+
|
21
|
+
steps:
|
22
|
+
- uses: actions/checkout@v3
|
23
|
+
|
24
|
+
- name: Show Ruby Version
|
25
|
+
run: ruby -v
|
26
|
+
|
27
|
+
- name: Install dependencies
|
28
|
+
run: bundle install
|
29
|
+
|
30
|
+
- name: Install rspec
|
31
|
+
run: gem install rspec
|
32
|
+
|
33
|
+
- name: Run tests
|
34
|
+
run: rspec
|
35
|
+
|
36
|
+
|
data/.gitignore
CHANGED
data/.rubocop.yml
ADDED
@@ -0,0 +1,80 @@
|
|
1
|
+
AllCops:
|
2
|
+
NewCops: disable
|
3
|
+
SuggestExtensions: false
|
4
|
+
TargetRubyVersion: 2.6
|
5
|
+
|
6
|
+
Documentation:
|
7
|
+
Enabled: false
|
8
|
+
|
9
|
+
Naming/AccessorMethodName:
|
10
|
+
Enabled: false
|
11
|
+
|
12
|
+
Naming/VariableNumber:
|
13
|
+
Enabled: false
|
14
|
+
|
15
|
+
Naming/FileName:
|
16
|
+
Enabled: false
|
17
|
+
|
18
|
+
Security/MarshalLoad:
|
19
|
+
Enabled: false
|
20
|
+
|
21
|
+
Security/Open:
|
22
|
+
Enabled: false
|
23
|
+
|
24
|
+
Layout/EndOfLine:
|
25
|
+
Enabled: False
|
26
|
+
|
27
|
+
Style/FormatStringToken:
|
28
|
+
Enabled: false
|
29
|
+
|
30
|
+
Style/ClassVars:
|
31
|
+
Enabled: false
|
32
|
+
|
33
|
+
Style/OptionalBooleanParameter:
|
34
|
+
Enabled: false
|
35
|
+
|
36
|
+
Style/StringConcatenation:
|
37
|
+
Enabled: false
|
38
|
+
|
39
|
+
Style/PerlBackrefs:
|
40
|
+
Enabled: false
|
41
|
+
|
42
|
+
Style/StringLiterals:
|
43
|
+
Enabled: false
|
44
|
+
|
45
|
+
Style/StringLiteralsInInterpolation:
|
46
|
+
Enabled: true
|
47
|
+
EnforcedStyle: double_quotes
|
48
|
+
|
49
|
+
Style/WordArray:
|
50
|
+
Enabled: false
|
51
|
+
|
52
|
+
Style/EvalWithLocation:
|
53
|
+
Enabled: false
|
54
|
+
|
55
|
+
Layout/LineLength:
|
56
|
+
Max: 400
|
57
|
+
|
58
|
+
Metrics/MethodLength:
|
59
|
+
Max: 200
|
60
|
+
|
61
|
+
Metrics/BlockLength:
|
62
|
+
Max: 200
|
63
|
+
|
64
|
+
Metrics/AbcSize:
|
65
|
+
Max: 200
|
66
|
+
|
67
|
+
Metrics/PerceivedComplexity:
|
68
|
+
Max: 60
|
69
|
+
|
70
|
+
Metrics/ClassLength:
|
71
|
+
Max: 800
|
72
|
+
|
73
|
+
Metrics/CyclomaticComplexity:
|
74
|
+
Max: 60
|
75
|
+
|
76
|
+
Metrics/ParameterLists:
|
77
|
+
Max: 8
|
78
|
+
|
79
|
+
Metrics/ModuleLength:
|
80
|
+
Max: 600
|
data/.solargraph.yml
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
---
|
2
|
+
include:
|
3
|
+
- "**/*.rb"
|
4
|
+
exclude:
|
5
|
+
- spec/**/*
|
6
|
+
- test/**/*
|
7
|
+
- vendor/**/*
|
8
|
+
- ".bundle/**/*"
|
9
|
+
require: []
|
10
|
+
domains: []
|
11
|
+
reporters:
|
12
|
+
- rubocop
|
13
|
+
# - require_not_found
|
14
|
+
formatter:
|
15
|
+
rubocop:
|
16
|
+
cops: safe
|
17
|
+
except: []
|
18
|
+
only: []
|
19
|
+
extra_args: []
|
20
|
+
require_paths: []
|
21
|
+
plugins: []
|
22
|
+
max_files: 5000
|
data/Dockerfile
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
FROM ruby:3.1.3-alpine3.17
|
2
|
+
|
3
|
+
WORKDIR /wp2txt
|
4
|
+
COPY . ./
|
5
|
+
RUN rm -Rf wp2txt/Gemfile.lock
|
6
|
+
|
7
|
+
RUN apk update && \
|
8
|
+
apk upgrade && \
|
9
|
+
apk add --no-cache linux-headers libxml2-dev make gcc libc-dev bash && \
|
10
|
+
apk add --no-cache -t .build-packages --no-cache build-base curl-dev wget gcompat && \
|
11
|
+
bundle install -j4
|
12
|
+
|
13
|
+
RUN wget https://fossies.org/linux/privat/lbzip2-2.5.tar.gz -O lbzip2.tar.gz && \
|
14
|
+
tar -xvf lbzip2.tar.gz && cd lbzip2-2.5 && \
|
15
|
+
bash configure && make && make install && \
|
16
|
+
cd .. && rm -rf lbzip2*
|
17
|
+
|
18
|
+
WORKDIR /
|
19
|
+
ENV PATH $PATH:/wp2txt/bin
|
20
|
+
CMD ["bash"]
|
data/Gemfile
CHANGED
@@ -1,4 +1,11 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
source "http://rubygems.org"
|
2
4
|
|
3
|
-
|
4
|
-
|
5
|
+
gem "htmlentities"
|
6
|
+
gem "nokogiri"
|
7
|
+
gem "optimist"
|
8
|
+
gem "parallel"
|
9
|
+
gem "pastel"
|
10
|
+
gem "ruby-progressbar"
|
11
|
+
gem "tty-spinner"
|
data/README.md
CHANGED
@@ -8,6 +8,15 @@ WP2TXT extracts text and category data from Wikipedia dump files (encoded in XML
|
|
8
8
|
|
9
9
|
## Changelog
|
10
10
|
|
11
|
+
**January 2023**
|
12
|
+
|
13
|
+
- Bug related to command line arguments fixed
|
14
|
+
- Code cleanup introducing Rubocop
|
15
|
+
|
16
|
+
**December 2022**
|
17
|
+
|
18
|
+
- Docker images available via Docker Hub
|
19
|
+
|
11
20
|
**November 2022**
|
12
21
|
|
13
22
|
- Code added to suppress "Invalid byte sequence error" when an illegal UTF-8 character is input.
|
@@ -38,9 +47,28 @@ In the above environment, the process (decompression, splitting, extraction, and
|
|
38
47
|
- Allows extracting category information of the article
|
39
48
|
- Allows extracting opening paragraphs of the article
|
40
49
|
|
41
|
-
##
|
50
|
+
## Setting Up
|
51
|
+
|
52
|
+
### WP2TXT on Docker
|
53
|
+
|
54
|
+
1. Install [Docker Desktop](https://www.docker.com/products/docker-desktop/) (Mac/Windows/Linux)
|
55
|
+
2. Execute `docker` command in a terminal:
|
56
|
+
|
57
|
+
```shell
|
58
|
+
docker run -it -v /Users/me/localdata:/data yohasebe/wp2txt
|
59
|
+
```
|
60
|
+
|
61
|
+
- Make sure to replace `/Users/me/localdata` with the full path to the data directory on your local computer
|
62
|
+
|
63
|
+
3. The Docker image will begin downloading and a bash prompt will appear when finished.
|
64
|
+
4. The `wp2txt` command will be available anywhere in the Docker container. Use the `/data` directory as the location of the input dump files and the output text files.
|
65
|
+
|
66
|
+
**IMPORTANT:**
|
67
|
+
|
68
|
+
- Configure Docker Desktop resource settings (number of cores, amount of memory, etc.) to get the best performance possible.
|
69
|
+
- When running the `wp2txt` command inside a Docker container, be sure to set the output directory to somewhere in the mounted local directory specified by the `docker run` command.
|
42
70
|
|
43
|
-
###
|
71
|
+
### WP2TXT on MacOS and Linux
|
44
72
|
|
45
73
|
WP2TXT requires that one of the following commands be installed on the system in order to decompress `bz2` files:
|
46
74
|
|
@@ -54,7 +82,7 @@ If you are using MacOS with Homebrew installed, you can install `lbzip2` with th
|
|
54
82
|
|
55
83
|
$ brew install lbzip2
|
56
84
|
|
57
|
-
###
|
85
|
+
### WP2TXT on Windows
|
58
86
|
|
59
87
|
Install [Bzip2 for Windows](http://gnuwin32.sourceforge.net/packages/bzip2.htm) and set the path so that WP2TXT can use the bunzip2.exe command. Alternatively, you can extract the Wikipedia dump file in your own way and process the resulting XML file with WP2TXT.
|
60
88
|
|
@@ -70,7 +98,7 @@ Download the latest Wikipedia dump file for the desired language at a URL such a
|
|
70
98
|
|
71
99
|
https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2
|
72
100
|
|
73
|
-
Here, `enwiki` refers to the English Wikipedia. To get the Japanese Wikipedia dump file, for instance, change this to jawiki (Japanese). In doing so, note that there are two instances of `enwiki` in the URL above.
|
101
|
+
Here, `enwiki` refers to the English Wikipedia. To get the Japanese Wikipedia dump file, for instance, change this to `jawiki` (Japanese). In doing so, note that there are two instances of `enwiki` in the URL above.
|
74
102
|
|
75
103
|
Alternatively, you can also select Wikipedia dump files created on a specific date from [here](http://dumps.wikimedia.org/backup-index.html). Make sure to download a file named in the following format:
|
76
104
|
|
@@ -190,11 +218,11 @@ The author will appreciate your mentioning one of these in your research.
|
|
190
218
|
Or use this BibTeX entry:
|
191
219
|
|
192
220
|
```
|
193
|
-
@misc{
|
221
|
+
@misc{wp2txt_2023,
|
194
222
|
author = {Yoichiro Hasebe},
|
195
223
|
title = {WP2TXT: A command-line toolkit to extract text content and category data from Wikipedia dump files},
|
196
224
|
url = {https://github.com/yohasebe/wp2txt},
|
197
|
-
year = {
|
225
|
+
year = {2023}
|
198
226
|
}
|
199
227
|
```
|
200
228
|
|
data/Rakefile
CHANGED
@@ -1,9 +1,30 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require "bundler/gem_tasks"
|
2
|
-
require
|
3
|
-
require
|
4
|
+
require "rspec/core"
|
5
|
+
require "rspec/core/rake_task"
|
6
|
+
require_relative "./lib/wp2txt/version"
|
7
|
+
|
8
|
+
class String
|
9
|
+
def strip_heredoc
|
10
|
+
gsub(/^#{scan(/^[ \t]*(?=\S)/).min}/, "")
|
11
|
+
end
|
12
|
+
end
|
4
13
|
|
5
14
|
RSpec::Core::RakeTask.new(:spec) do |spec|
|
6
|
-
spec.pattern = FileList[
|
15
|
+
spec.pattern = FileList["spec/**/*_spec.rb"]
|
7
16
|
end
|
8
17
|
|
9
|
-
task :
|
18
|
+
task default: :spec
|
19
|
+
|
20
|
+
desc "Push Docker images"
|
21
|
+
task :push do
|
22
|
+
sh <<-SCRIPT.strip_heredoc, { verbose: false }
|
23
|
+
/bin/bash -xeu <<'BASH'
|
24
|
+
# docker buildx create --name mybuilder
|
25
|
+
# docker buildx use mybuilder
|
26
|
+
# docker buildx inspect --bootstrap
|
27
|
+
docker buildx build --platform linux/amd64,linux/arm64 -t yohasebe/wp2txt:#{Wp2txt::VERSION} -t yohasebe/wp2txt:latest . --push
|
28
|
+
BASH
|
29
|
+
SCRIPT
|
30
|
+
end
|