wp2txt 1.0.2 → 1.1.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: bb540f4f17f7825786d110245c235ac556e3e64cedb17efae3e0591887425801
4
- data.tar.gz: 479c357f7ba117ae10d9a5a04d24ce3aca2e54d942a156b02eb932c1aab55c8b
3
+ metadata.gz: 0bcba2c84286504ae628176aad55dbdea05889dfaa7f471cf080ae933691cffc
4
+ data.tar.gz: f83c63d7c6e91270da1da2aed54ab6e5c352c5695340ccd6378fdd20c43fc332
5
5
  SHA512:
6
- metadata.gz: 940d47d2c8bce06029fe76e3b3744563d089e26e297e5224b36e65d815295da57117eae84cbb43abeddf2f2c052e2a987d668cba52c7af6148e935b571b6d403
7
- data.tar.gz: 8ce76523a3bf181ac7a5da11f088dd14cfb1e1d7ac0d5239832db52968d183db16a3ece6074513b634eebe0e5ca28ceea945eaef6542ecb1933266caf4e89a3c
6
+ metadata.gz: ae0eae028a98d4299a0e93278220b991e53e13deb80b88cce2971cd889d769808305e0d3aa8ee4e73af0cc55f07f27c0cc6a9f0d440e4693a410ba7d0a6333ba
7
+ data.tar.gz: 25bff247bf80b4a0b5ff785ed51b60e3e21e6a3ca5e0bfeace9961df5060832ae047c3fa606a3267f9f770c253fd816176372c4d2f7ba73999d5317ea59933e6
data/.dockerignore ADDED
@@ -0,0 +1,8 @@
1
+ .git
2
+ .github
3
+ image
4
+ pkg
5
+ spec
6
+ .dockerignore
7
+ .gitignore
8
+ Gemfile.lock
@@ -0,0 +1,36 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ pull_request:
6
+ workflow_dispatch:
7
+ schedule:
8
+ - cron: '42 5 * * *'
9
+
10
+ jobs:
11
+ test:
12
+ strategy:
13
+ fail-fast: false
14
+ matrix:
15
+ ruby: [ '3.1' ]
16
+
17
+ runs-on: ubuntu-latest
18
+ name: Ruby ${{matrix.ruby}}
19
+ container: ruby:${{matrix.ruby}}
20
+
21
+ steps:
22
+ - uses: actions/checkout@v3
23
+
24
+ - name: Show Ruby Version
25
+ run: ruby -v
26
+
27
+ - name: Install dependencies
28
+ run: bundle install
29
+
30
+ - name: Install rspec
31
+ run: gem install rspec
32
+
33
+ - name: Run tests
34
+ run: rspec
35
+
36
+
data/.gitignore CHANGED
@@ -18,4 +18,4 @@ tmp
18
18
  .DS_Store
19
19
  *.bak
20
20
  *.~
21
-
21
+ tags
data/.rubocop.yml ADDED
@@ -0,0 +1,80 @@
1
+ AllCops:
2
+ NewCops: disable
3
+ SuggestExtensions: false
4
+ TargetRubyVersion: 2.6
5
+
6
+ Documentation:
7
+ Enabled: false
8
+
9
+ Naming/AccessorMethodName:
10
+ Enabled: false
11
+
12
+ Naming/VariableNumber:
13
+ Enabled: false
14
+
15
+ Naming/FileName:
16
+ Enabled: false
17
+
18
+ Security/MarshalLoad:
19
+ Enabled: false
20
+
21
+ Security/Open:
22
+ Enabled: false
23
+
24
+ Layout/EndOfLine:
25
+ Enabled: False
26
+
27
+ Style/FormatStringToken:
28
+ Enabled: false
29
+
30
+ Style/ClassVars:
31
+ Enabled: false
32
+
33
+ Style/OptionalBooleanParameter:
34
+ Enabled: false
35
+
36
+ Style/StringConcatenation:
37
+ Enabled: false
38
+
39
+ Style/PerlBackrefs:
40
+ Enabled: false
41
+
42
+ Style/StringLiterals:
43
+ Enabled: false
44
+
45
+ Style/StringLiteralsInInterpolation:
46
+ Enabled: true
47
+ EnforcedStyle: double_quotes
48
+
49
+ Style/WordArray:
50
+ Enabled: false
51
+
52
+ Style/EvalWithLocation:
53
+ Enabled: false
54
+
55
+ Layout/LineLength:
56
+ Max: 400
57
+
58
+ Metrics/MethodLength:
59
+ Max: 200
60
+
61
+ Metrics/BlockLength:
62
+ Max: 200
63
+
64
+ Metrics/AbcSize:
65
+ Max: 200
66
+
67
+ Metrics/PerceivedComplexity:
68
+ Max: 60
69
+
70
+ Metrics/ClassLength:
71
+ Max: 800
72
+
73
+ Metrics/CyclomaticComplexity:
74
+ Max: 60
75
+
76
+ Metrics/ParameterLists:
77
+ Max: 8
78
+
79
+ Metrics/ModuleLength:
80
+ Max: 600
data/.solargraph.yml ADDED
@@ -0,0 +1,22 @@
1
+ ---
2
+ include:
3
+ - "**/*.rb"
4
+ exclude:
5
+ - spec/**/*
6
+ - test/**/*
7
+ - vendor/**/*
8
+ - ".bundle/**/*"
9
+ require: []
10
+ domains: []
11
+ reporters:
12
+ - rubocop
13
+ # - require_not_found
14
+ formatter:
15
+ rubocop:
16
+ cops: safe
17
+ except: []
18
+ only: []
19
+ extra_args: []
20
+ require_paths: []
21
+ plugins: []
22
+ max_files: 5000
data/Dockerfile ADDED
@@ -0,0 +1,20 @@
1
+ FROM ruby:3.1.3-alpine3.17
2
+
3
+ WORKDIR /wp2txt
4
+ COPY . ./
5
+ RUN rm -Rf wp2txt/Gemfile.lock
6
+
7
+ RUN apk update && \
8
+ apk upgrade && \
9
+ apk add --no-cache linux-headers libxml2-dev make gcc libc-dev bash && \
10
+ apk add --no-cache -t .build-packages --no-cache build-base curl-dev wget gcompat && \
11
+ bundle install -j4
12
+
13
+ RUN wget https://fossies.org/linux/privat/lbzip2-2.5.tar.gz -O lbzip2.tar.gz && \
14
+ tar -xvf lbzip2.tar.gz && cd lbzip2-2.5 && \
15
+ bash configure && make && make install && \
16
+ cd .. && rm -rf lbzip2*
17
+
18
+ WORKDIR /
19
+ ENV PATH $PATH:/wp2txt/bin
20
+ CMD ["bash"]
data/Gemfile CHANGED
@@ -1,4 +1,11 @@
1
+ # frozen_string_literal: true
2
+
1
3
  source "http://rubygems.org"
2
4
 
3
- # Specify your gem's dependencies in wp2txt.gemspec
4
- gemspec
5
+ gem "htmlentities"
6
+ gem "nokogiri"
7
+ gem "optimist"
8
+ gem "parallel"
9
+ gem "pastel"
10
+ gem "ruby-progressbar"
11
+ gem "tty-spinner"
data/README.md CHANGED
@@ -8,6 +8,15 @@ WP2TXT extracts text and category data from Wikipedia dump files (encoded in XML
8
8
 
9
9
  ## Changelog
10
10
 
11
+ **January 2023**
12
+
13
+ - Bug related to command line arguments fixed
14
+ - Code cleanup introducing Rubocop
15
+
16
+ **December 2022**
17
+
18
+ - Docker images available via Docker Hub
19
+
11
20
  **November 2022**
12
21
 
13
22
  - Code added to suppress "Invalid byte sequence error" when an illegal UTF-8 character is input.
@@ -38,9 +47,28 @@ In the above environment, the process (decompression, splitting, extraction, and
38
47
  - Allows extracting category information of the article
39
48
  - Allows extracting opening paragraphs of the article
40
49
 
41
- ## Preparation
50
+ ## Setting Up
51
+
52
+ ### WP2TXT on Docker
53
+
54
+ 1. Install [Docker Desktop](https://www.docker.com/products/docker-desktop/) (Mac/Windows/Linux)
55
+ 2. Execute the following `docker` command in a terminal:
56
+
57
+ ```shell
58
+ docker run -it -v /Users/me/localdata:/data yohasebe/wp2txt
59
+ ```
60
+
61
+ - Make sure to replace `/Users/me/localdata` with the full path to the data directory on your local computer
62
+
63
+ 3. The Docker image will begin downloading and a bash prompt will appear when finished.
64
+ 4. The `wp2txt` command will be available anywhere in the Docker container. Use the `/data` directory as the location of the input dump files and the output text files.
65
+
66
+ **IMPORTANT:**
67
+
68
+ - Configure Docker Desktop resource settings (number of cores, amount of memory, etc.) to get the best performance possible.
69
+ - When running the `wp2txt` command inside a Docker container, be sure to set the output directory to somewhere in the mounted local directory specified by the `docker run` command.
42
70
 
43
- ### For MacOS and Linux
71
+ ### WP2TXT on MacOS and Linux
44
72
 
45
73
  WP2TXT requires that one of the following commands be installed on the system in order to decompress `bz2` files:
46
74
 
@@ -54,7 +82,7 @@ If you are using MacOS with Homebrew installed, you can install `lbzip2` with th
54
82
 
55
83
  $ brew install lbzip2
56
84
 
57
- ### For Windows
85
+ ### WP2TXT on Windows
58
86
 
59
87
  Install [Bzip2 for Windows](http://gnuwin32.sourceforge.net/packages/bzip2.htm) and set the path so that WP2TXT can use the bunzip2.exe command. Alternatively, you can extract the Wikipedia dump file in your own way and process the resulting XML file with WP2TXT.
60
88
 
@@ -70,7 +98,7 @@ Download the latest Wikipedia dump file for the desired language at a URL such a
70
98
 
71
99
  https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2
72
100
 
73
- Here, `enwiki` refers to the English Wikipedia. To get the Japanese Wikipedia dump file, for instance, change this to jawiki (Japanese). In doing so, note that there are two instances of `enwiki` in the URL above.
101
+ Here, `enwiki` refers to the English Wikipedia. To get the Japanese Wikipedia dump file, for instance, change this to `jawiki` (Japanese). In doing so, note that there are two instances of `enwiki` in the URL above.
74
102
 
75
103
  Alternatively, you can also select Wikipedia dump files created on a specific date from [here](http://dumps.wikimedia.org/backup-index.html). Make sure to download a file named in the following format:
76
104
 
@@ -190,11 +218,11 @@ The author will appreciate your mentioning one of these in your research.
190
218
  Or use this BibTeX entry:
191
219
 
192
220
  ```
193
- @misc{wp2txt_2022,
221
+ @misc{wp2txt_2023,
194
222
  author = {Yoichiro Hasebe},
195
223
  title = {WP2TXT: A command-line toolkit to extract text content and category data from Wikipedia dump files},
196
224
  url = {https://github.com/yohasebe/wp2txt},
197
- year = {2022}
225
+ year = {2023}
198
226
  }
199
227
  ```
200
228
 
data/Rakefile CHANGED
@@ -1,9 +1,30 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require "bundler/gem_tasks"
2
- require 'rspec/core'
3
- require 'rspec/core/rake_task'
4
+ require "rspec/core"
5
+ require "rspec/core/rake_task"
6
+ require_relative "./lib/wp2txt/version"
7
+
8
+ class String
9
+ def strip_heredoc
10
+ gsub(/^#{scan(/^[ \t]*(?=\S)/).min}/, "")
11
+ end
12
+ end
4
13
 
5
14
  RSpec::Core::RakeTask.new(:spec) do |spec|
6
- spec.pattern = FileList['spec/**/*_spec.rb']
15
+ spec.pattern = FileList["spec/**/*_spec.rb"]
7
16
  end
8
17
 
9
- task :default => :spec
18
+ task default: :spec
19
+
20
+ desc "Push Docker images"
21
+ task :push do
22
+ sh <<-SCRIPT.strip_heredoc, { verbose: false }
23
+ /bin/bash -xeu <<'BASH'
24
+ # docker buildx create --name mybuilder
25
+ # docker buildx use mybuilder
26
+ # docker buildx inspect --bootstrap
27
+ docker buildx build --platform linux/amd64,linux/arm64 -t yohasebe/wp2txt:#{Wp2txt::VERSION} -t yohasebe/wp2txt:latest . --push
28
+ BASH
29
+ SCRIPT
30
+ end