wp2txt 1.0.2 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.dockerignore +8 -0
- data/.github/workflows/ci.yml +36 -0
- data/.gitignore +1 -1
- data/.rubocop.yml +80 -0
- data/.solargraph.yml +22 -0
- data/Dockerfile +20 -0
- data/Gemfile +9 -2
- data/README.md +26 -3
- data/Rakefile +25 -4
- data/bin/wp2txt +177 -182
- data/lib/wp2txt/article.rb +70 -92
- data/lib/wp2txt/regex.rb +93 -0
- data/lib/wp2txt/utils.rb +159 -270
- data/lib/wp2txt/version.rb +3 -1
- data/lib/wp2txt.rb +119 -150
- data/spec/spec_helper.rb +4 -4
- data/spec/utils_spec.rb +101 -124
- data/wp2txt.gemspec +16 -18
- metadata +60 -12
- data/tags +0 -58
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 62f1e8d6ab1932f3ae3c34fb71930b7e73500c832481dcea6288742c38850a79
|
4
|
+
data.tar.gz: f0ff0a5488b635b828338d41029c5ad191a0c88282c0fa294a9facf2d93c055b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7bca85758e88d53dcd33fe43e83a251624f89329a2cb55ffb97b41141bcf8fe5ace7c48e3b8e49f5aa42f84724247cfe4ad376238a949e9154876d4d07469afe
|
7
|
+
data.tar.gz: de59399d5163afed2947e0802abf2e0365894d566c8a1f11823bc901d4948346e7af47d6fba558387f5af7e1301a6725a51a322ac1cd4810264dc3003e0729e2
|
data/.dockerignore
ADDED
data/.github/workflows/ci.yml
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
name: CI
|
2
|
+
|
3
|
+
on:
|
4
|
+
push:
|
5
|
+
pull_request:
|
6
|
+
workflow_dispatch:
|
7
|
+
schedule:
|
8
|
+
- cron: '42 5 * * *'
|
9
|
+
|
10
|
+
jobs:
|
11
|
+
test:
|
12
|
+
strategy:
|
13
|
+
fail-fast: false
|
14
|
+
matrix:
|
15
|
+
ruby: [ '3.1' ]
|
16
|
+
|
17
|
+
runs-on: ubuntu-latest
|
18
|
+
name: Ruby ${{matrix.ruby}}
|
19
|
+
container: ruby:${{matrix.ruby}}
|
20
|
+
|
21
|
+
steps:
|
22
|
+
- uses: actions/checkout@v3
|
23
|
+
|
24
|
+
- name: Show Ruby Version
|
25
|
+
run: ruby -v
|
26
|
+
|
27
|
+
- name: Install dependencies
|
28
|
+
run: bundle install
|
29
|
+
|
30
|
+
- name: Install rspec
|
31
|
+
run: gem install rspec
|
32
|
+
|
33
|
+
- name: Run tests
|
34
|
+
run: rspec
|
35
|
+
|
36
|
+
|
data/.gitignore
CHANGED
data/.rubocop.yml
ADDED
@@ -0,0 +1,80 @@
|
|
1
|
+
AllCops:
|
2
|
+
NewCops: disable
|
3
|
+
SuggestExtensions: false
|
4
|
+
TargetRubyVersion: 2.6
|
5
|
+
|
6
|
+
Documentation:
|
7
|
+
Enabled: false
|
8
|
+
|
9
|
+
Naming/AccessorMethodName:
|
10
|
+
Enabled: false
|
11
|
+
|
12
|
+
Naming/VariableNumber:
|
13
|
+
Enabled: false
|
14
|
+
|
15
|
+
Naming/FileName:
|
16
|
+
Enabled: false
|
17
|
+
|
18
|
+
Security/MarshalLoad:
|
19
|
+
Enabled: false
|
20
|
+
|
21
|
+
Security/Open:
|
22
|
+
Enabled: false
|
23
|
+
|
24
|
+
Layout/EndOfLine:
|
25
|
+
Enabled: False
|
26
|
+
|
27
|
+
Style/FormatStringToken:
|
28
|
+
Enabled: false
|
29
|
+
|
30
|
+
Style/ClassVars:
|
31
|
+
Enabled: false
|
32
|
+
|
33
|
+
Style/OptionalBooleanParameter:
|
34
|
+
Enabled: false
|
35
|
+
|
36
|
+
Style/StringConcatenation:
|
37
|
+
Enabled: false
|
38
|
+
|
39
|
+
Style/PerlBackrefs:
|
40
|
+
Enabled: false
|
41
|
+
|
42
|
+
Style/StringLiterals:
|
43
|
+
Enabled: false
|
44
|
+
|
45
|
+
Style/StringLiteralsInInterpolation:
|
46
|
+
Enabled: true
|
47
|
+
EnforcedStyle: double_quotes
|
48
|
+
|
49
|
+
Style/WordArray:
|
50
|
+
Enabled: false
|
51
|
+
|
52
|
+
Style/EvalWithLocation:
|
53
|
+
Enabled: false
|
54
|
+
|
55
|
+
Layout/LineLength:
|
56
|
+
Max: 400
|
57
|
+
|
58
|
+
Metrics/MethodLength:
|
59
|
+
Max: 200
|
60
|
+
|
61
|
+
Metrics/BlockLength:
|
62
|
+
Max: 200
|
63
|
+
|
64
|
+
Metrics/AbcSize:
|
65
|
+
Max: 200
|
66
|
+
|
67
|
+
Metrics/PerceivedComplexity:
|
68
|
+
Max: 60
|
69
|
+
|
70
|
+
Metrics/ClassLength:
|
71
|
+
Max: 800
|
72
|
+
|
73
|
+
Metrics/CyclomaticComplexity:
|
74
|
+
Max: 60
|
75
|
+
|
76
|
+
Metrics/ParameterLists:
|
77
|
+
Max: 8
|
78
|
+
|
79
|
+
Metrics/ModuleLength:
|
80
|
+
Max: 600
|
data/.solargraph.yml
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
---
|
2
|
+
include:
|
3
|
+
- "**/*.rb"
|
4
|
+
exclude:
|
5
|
+
- spec/**/*
|
6
|
+
- test/**/*
|
7
|
+
- vendor/**/*
|
8
|
+
- ".bundle/**/*"
|
9
|
+
require: []
|
10
|
+
domains: []
|
11
|
+
reporters:
|
12
|
+
- rubocop
|
13
|
+
# - require_not_found
|
14
|
+
formatter:
|
15
|
+
rubocop:
|
16
|
+
cops: safe
|
17
|
+
except: []
|
18
|
+
only: []
|
19
|
+
extra_args: []
|
20
|
+
require_paths: []
|
21
|
+
plugins: []
|
22
|
+
max_files: 5000
|
data/Dockerfile
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
FROM ruby:3.1.3-alpine3.17
|
2
|
+
|
3
|
+
WORKDIR /wp2txt
|
4
|
+
COPY . ./
|
5
|
+
RUN rm -Rf wp2txt/Gemfile.lock
|
6
|
+
|
7
|
+
RUN apk update && \
|
8
|
+
apk upgrade && \
|
9
|
+
apk add --no-cache linux-headers libxml2-dev make gcc libc-dev bash && \
|
10
|
+
apk add --no-cache -t .build-packages --no-cache build-base curl-dev wget gcompat && \
|
11
|
+
bundle install -j4
|
12
|
+
|
13
|
+
RUN wget https://fossies.org/linux/privat/lbzip2-2.5.tar.gz -O lbzip2.tar.gz && \
|
14
|
+
tar -xvf lbzip2.tar.gz && cd lbzip2-2.5 && \
|
15
|
+
bash configure && make && make install && \
|
16
|
+
cd .. && rm -rf lbzip2*
|
17
|
+
|
18
|
+
WORKDIR /
|
19
|
+
ENV PATH $PATH:/wp2txt/bin
|
20
|
+
CMD ["bash"]
|
data/Gemfile
CHANGED
@@ -1,4 +1,11 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
source "http://rubygems.org"
|
2
4
|
|
3
|
-
|
4
|
-
|
5
|
+
gem "htmlentities"
|
6
|
+
gem "nokogiri"
|
7
|
+
gem "optimist"
|
8
|
+
gem "parallel"
|
9
|
+
gem "pastel"
|
10
|
+
gem "ruby-progressbar"
|
11
|
+
gem "tty-spinner"
|
data/README.md
CHANGED
@@ -8,6 +8,10 @@ WP2TXT extracts text and category data from Wikipedia dump files (encoded in XML
|
|
8
8
|
|
9
9
|
## Changelog
|
10
10
|
|
11
|
+
**December 2022**
|
12
|
+
|
13
|
+
- Docker images available via Docker Hub
|
14
|
+
|
11
15
|
**November 2022**
|
12
16
|
|
13
17
|
- Code added to suppress "Invalid byte sequence error" when an illegal UTF-8 character is input.
|
@@ -38,9 +42,28 @@ In the above environment, the process (decompression, splitting, extraction, and
|
|
38
42
|
- Allows extracting category information of the article
|
39
43
|
- Allows extracting opening paragraphs of the article
|
40
44
|
|
41
|
-
##
|
45
|
+
## Setting Up
|
46
|
+
|
47
|
+
### WP2TXT on Docker
|
48
|
+
|
49
|
+
1. Install [Docker Desktop](https://www.docker.com/products/docker-desktop/) (Mac/Windows/Linux)
|
50
|
+
2. Execute `docker` command in a terminal:
|
51
|
+
|
52
|
+
```shell
|
53
|
+
docker run -it -v /Users/me/localdata:/data yohasebe/wp2txt
|
54
|
+
```
|
55
|
+
|
56
|
+
- Make sure to replace `/Users/me/localdata` with the full path to the data directory on your local computer
|
57
|
+
|
58
|
+
3. The Docker image will begin downloading and a bash prompt will appear when finished.
|
59
|
+
4. The `wp2txt` command will be available anywhere in the Docker container. Use the `/data` directory as the location of the input dump files and the output text files.
|
60
|
+
|
61
|
+
**IMPORTANT:**
|
62
|
+
|
63
|
+
- Configure Docker Desktop resource settings (number of cores, amount of memory, etc.) to get the best performance possible.
|
64
|
+
- When running the `wp2txt` command inside a Docker container, be sure to set the output directory to somewhere in the mounted local directory specified by the `docker run` command.
|
42
65
|
|
43
|
-
###
|
66
|
+
### WP2TXT on MacOS and Linux
|
44
67
|
|
45
68
|
WP2TXT requires that one of the following commands be installed on the system in order to decompress `bz2` files:
|
46
69
|
|
@@ -54,7 +77,7 @@ If you are using MacOS with Homebrew installed, you can install `lbzip2` with th
|
|
54
77
|
|
55
78
|
$ brew install lbzip2
|
56
79
|
|
57
|
-
###
|
80
|
+
### WP2TXT on Windows
|
58
81
|
|
59
82
|
Install [Bzip2 for Windows](http://gnuwin32.sourceforge.net/packages/bzip2.htm) and set the path so that WP2TXT can use the bunzip2.exe command. Alternatively, you can extract the Wikipedia dump file in your own way and process the resulting XML file with WP2TXT.
|
60
83
|
|
data/Rakefile
CHANGED
@@ -1,9 +1,30 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require "bundler/gem_tasks"
|
2
|
-
require
|
3
|
-
require
|
4
|
+
require "rspec/core"
|
5
|
+
require "rspec/core/rake_task"
|
6
|
+
require_relative "./lib/wp2txt/version"
|
7
|
+
|
8
|
+
class String
|
9
|
+
def strip_heredoc
|
10
|
+
gsub(/^#{scan(/^[ \t]*(?=\S)/).min}/, "")
|
11
|
+
end
|
12
|
+
end
|
4
13
|
|
5
14
|
RSpec::Core::RakeTask.new(:spec) do |spec|
|
6
|
-
spec.pattern = FileList[
|
15
|
+
spec.pattern = FileList["spec/**/*_spec.rb"]
|
7
16
|
end
|
8
17
|
|
9
|
-
task :
|
18
|
+
task default: :spec
|
19
|
+
|
20
|
+
desc "Push Docker images"
|
21
|
+
task :push do
|
22
|
+
sh <<-SCRIPT.strip_heredoc, { verbose: false }
|
23
|
+
/bin/bash -xeu <<'BASH'
|
24
|
+
# docker buildx create --name mybuilder
|
25
|
+
# docker buildx use mybuilder
|
26
|
+
# docker buildx inspect --bootstrap
|
27
|
+
docker buildx build --platform linux/amd64,linux/arm64 -t yohasebe/wp2txt:#{Wp2txt::VERSION} -t yohasebe/wp2txt:latest . --push
|
28
|
+
BASH
|
29
|
+
SCRIPT
|
30
|
+
end
|
data/bin/wp2txt
CHANGED
@@ -1,197 +1,192 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
-
# -*- coding: utf-8 -*-
|
3
|
-
|
4
|
-
$: << File.join(File.dirname(__FILE__))
|
5
|
-
$: << File.join(File.dirname(__FILE__), '..', 'lib')
|
6
|
-
|
7
|
-
$DEBUG_MODE = false
|
8
|
-
SHAREDIR = File.join(File.dirname(__FILE__), '..', 'share')
|
9
|
-
DOCDIR = File.join(File.dirname(__FILE__), '..', 'doc')
|
10
|
-
|
11
|
-
require 'wp2txt'
|
12
|
-
require 'wp2txt/utils'
|
13
|
-
require 'wp2txt/version'
|
14
|
-
require 'etc'
|
15
|
-
require 'optimist'
|
16
|
-
require 'parallel'
|
17
|
-
require 'pastel'
|
18
|
-
require 'tty-spinner'
|
19
|
-
|
20
|
-
include Wp2txt
|
21
|
-
|
22
|
-
opts = Optimist::options do
|
23
|
-
version Wp2txt::VERSION
|
24
|
-
banner <<-EOS
|
25
|
-
WP2TXT extracts plain text data from Wikipedia dump file (encoded in XML/compressed with Bzip2) stripping all the MediaWiki markups and other metadata.
|
26
|
-
|
27
|
-
Usage: wp2txt [options]
|
28
|
-
where [options] are:
|
29
|
-
EOS
|
30
|
-
|
31
|
-
opt :input, "Path to compressed file (bz2) or decompressed file (xml), or path to directory containing files of the latter format", :required => true, :short => "-i"
|
32
|
-
opt :output_dir, "Path to output directory", :default => Dir::pwd, :type => String, :short => "-o"
|
33
|
-
opt :convert, "Output in plain text (converting from XML)", :default => true, :short => "-c"
|
34
|
-
opt :category, "Show article category information", :default => true, :short => "-a"
|
35
|
-
opt :category_only, "Extract only article title and categories", :default => false, :short => "-g"
|
36
|
-
opt :summary_only, "Extract only article title, categories, and summary text before first heading", :default => false, :short => "-s"
|
37
|
-
opt :file_size, "Approximate size (in MB) of each output file", :default => 10, :short => "-f"
|
38
|
-
opt :num_procs, "Number of proccesses to be run concurrently (default: max num of CPU cores minus two)", :short => "-n"
|
39
|
-
opt :del_interfile, "Delete intermediate XML files from output dir", :short => "-x", :default => false
|
40
|
-
opt :title, "Keep page titles in output", :default => true, :short => "-t"
|
41
|
-
opt :heading, "Keep section titles in output", :default => true, :short => "-d"
|
42
|
-
opt :list, "Keep unprocessed list items in output", :default => false, :short => "-l"
|
43
|
-
opt :ref, "Keep reference notations in the format [ref]...[/ref]", :default => false, :short => "-r"
|
44
|
-
opt :redirect, "Show redirect destination", :default => false, :short => "-e"
|
45
|
-
opt :marker, "Show symbols prefixed to list items, definitions, etc.", :default => true, :short => "-m"
|
46
|
-
opt :bz2_gem, "Use Ruby's bzip2-ruby gem instead of a system command", :default => false, :short => "-b"
|
47
|
-
end
|
48
|
-
|
49
|
-
Optimist::die :size, "must be larger than 0" unless opts[:file_size] >= 0
|
50
|
-
Optimist::die :output_dir, "must exist" unless File.exist?(opts[:output_dir])
|
51
|
-
|
52
|
-
pastel = Pastel.new
|
53
2
|
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
DEBUG_MODE = false
|
6
|
+
SHAREDIR = File.join(File.dirname(__FILE__), "..", "share")
|
7
|
+
DOCDIR = File.join(File.dirname(__FILE__), "..", "doc")
|
8
|
+
|
9
|
+
require_relative "../lib/wp2txt"
|
10
|
+
require_relative "../lib/wp2txt/utils"
|
11
|
+
require_relative "../lib/wp2txt/version"
|
12
|
+
|
13
|
+
require "etc"
|
14
|
+
require "optimist"
|
15
|
+
require "parallel"
|
16
|
+
require "pastel"
|
17
|
+
require "tty-spinner"
|
18
|
+
|
19
|
+
class WpApp
|
20
|
+
include Wp2txt
|
21
|
+
|
22
|
+
def run
|
23
|
+
opts = Optimist.options do
|
24
|
+
version VERSION
|
25
|
+
banner <<~BANNER
|
26
|
+
WP2TXT extracts plain text data from Wikipedia dump file (encoded in XML/compressed with Bzip2) stripping all the MediaWiki markups and other metadata.
|
27
|
+
|
28
|
+
Usage: wp2txt [options]
|
29
|
+
where [options] are:
|
30
|
+
BANNER
|
31
|
+
|
32
|
+
opt :input, "Path to compressed file (bz2) or decompressed file (xml), or path to directory containing files of the latter format", type: String, required: true, short: "-i"
|
33
|
+
opt :output_dir, "Path to output directory", default: Dir.pwd, type: String, short: "-o"
|
34
|
+
opt :convert, "Output in plain text (converting from XML)", default: true, short: "-c"
|
35
|
+
opt :category, "Show article category information", default: true, short: "-a"
|
36
|
+
opt :category_only, "Extract only article title and categories", default: false, short: "-g"
|
37
|
+
opt :summary_only, "Extract only article title, categories, and summary text before first heading", default: false, short: "-s"
|
38
|
+
opt :file_size, "Approximate size (in MB) of each output file", default: 10, short: "-f"
|
39
|
+
opt :num_procs, "Number of proccesses to be run concurrently (default: max num of CPU cores minus two)", short: "-n"
|
40
|
+
opt :del_interfile, "Delete intermediate XML files from output dir", short: "-x", default: false
|
41
|
+
opt :title, "Keep page titles in output", default: true, short: "-t"
|
42
|
+
opt :heading, "Keep section titles in output", default: true, short: "-d"
|
43
|
+
opt :list, "Keep unprocessed list items in output", default: false, short: "-l"
|
44
|
+
opt :ref, "Keep reference notations in the format [ref]...[/ref]", default: false, short: "-r"
|
45
|
+
opt :redirect, "Show redirect destination", default: false, short: "-e"
|
46
|
+
opt :marker, "Show symbols prefixed to list items, definitions, etc.", default: true, short: "-m"
|
47
|
+
opt :bz2_gem, "Use Ruby's bzip2-ruby gem instead of a system command", default: false, short: "-b"
|
48
|
+
end
|
86
49
|
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
50
|
+
Optimist.die :size, "must be larger than 0" unless opts[:file_size] >= 0
|
51
|
+
Optimist.die :input, "must exist" unless File.exist?(opts[:input])
|
52
|
+
Optimist.die :output_dir, "must exist" unless File.exist?(opts[:output_dir])
|
53
|
+
|
54
|
+
pastel = Pastel.new
|
55
|
+
|
56
|
+
input_file = opts[:input]
|
57
|
+
output_dir = opts[:output_dir]
|
58
|
+
tfile_size = opts[:file_size]
|
59
|
+
num_processors = Etc.nprocessors
|
60
|
+
num_processes = if opts[:num_procs] && opts[:num_procs].to_i <= num_processors
|
61
|
+
opts[:num_procs]
|
62
|
+
else
|
63
|
+
num_processors - 2
|
64
|
+
end
|
65
|
+
num_processes = 1 if num_processes < 1
|
66
|
+
|
67
|
+
convert = opts[:convert]
|
68
|
+
strip_tmarker = opts[:marker] ? false : true
|
69
|
+
opt_array = %i[title list heading table redirect multiline category category_only summary_only del_interfile bz2_gem]
|
70
|
+
|
71
|
+
config = {}
|
72
|
+
opt_array.each do |opt|
|
73
|
+
config[opt] = opts[opt]
|
74
|
+
end
|
107
75
|
|
108
|
-
|
109
|
-
|
110
|
-
puts "Number of files being processed: " + pastel.bold("#{input_files.size}")
|
111
|
-
puts "Number of CPU cores being used: " + pastel.bold("#{num_processes}")
|
112
|
-
|
113
|
-
Parallel.map(input_files, progress: pastel.magenta.bold("WP2TXT"), in_processes: num_processes) do |input_file|
|
114
|
-
wpconv = Wp2txt::Runner.new(input_file, output_dir, strip_tmarker, config[:del_interfile])
|
115
|
-
wpconv.extract_text do |article|
|
116
|
-
format_wiki!(article.title)
|
117
|
-
|
118
|
-
if config[:category_only]
|
119
|
-
title = "#{article.title}\t"
|
120
|
-
contents = article.categories.join(", ")
|
121
|
-
contents << "\n"
|
122
|
-
elsif config[:category] && !article.categories.empty?
|
123
|
-
title = "\n[[#{article.title}]]\n\n"
|
124
|
-
contents = "\nCATEGORIES: "
|
125
|
-
contents << article.categories.join(", ")
|
126
|
-
contents << "\n\n"
|
76
|
+
if File.ftype(input_file) == "directory"
|
77
|
+
input_files = Dir.glob("#{input_file}/*.xml")
|
127
78
|
else
|
128
|
-
|
129
|
-
|
79
|
+
puts ""
|
80
|
+
puts pastel.green.bold("Preprocessing")
|
81
|
+
puts "Decompressing and splitting the original dump file."
|
82
|
+
puts pastel.underline("This may take a while. Please be patient!")
|
83
|
+
|
84
|
+
time_start = Time.now.to_i
|
85
|
+
wpsplitter = Splitter.new(input_file, output_dir, tfile_size)
|
86
|
+
spinner = TTY::Spinner.new(":spinner", format: :arrow_pulse, hide_cursor: true, interval: 5)
|
87
|
+
spinner.auto_spin
|
88
|
+
wpsplitter.split_file
|
89
|
+
time_finish = Time.now.to_i
|
90
|
+
|
91
|
+
spinner.stop("Time: #{sec_to_str(time_finish - time_start)}") # Stop animation
|
92
|
+
puts pastel.blue.bold("Complete!")
|
93
|
+
exit unless convert
|
94
|
+
input_files = Dir.glob("#{output_dir}/*.xml")
|
130
95
|
end
|
131
96
|
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
line = e.last
|
152
|
-
line << "+PRE+" if $DEBUG_MODE
|
153
|
-
when :mw_quote
|
154
|
-
line = e.last
|
155
|
-
line << "+QUOTE+" if $DEBUG_MODE
|
156
|
-
when :mw_unordered, :mw_ordered, :mw_definition
|
157
|
-
next if !config[:list]
|
158
|
-
line = e.last
|
159
|
-
line << "+LIST+" if $DEBUG_MODE
|
160
|
-
when :mw_ml_template
|
161
|
-
next if !config[:multiline]
|
162
|
-
line = e.last
|
163
|
-
line << "+MLTEMPLATE+" if $DEBUG_MODE
|
164
|
-
when :mw_redirect
|
165
|
-
next if !config[:redirect]
|
166
|
-
line = e.last
|
167
|
-
line << "+REDIRECT+" if $DEBUG_MODE
|
168
|
-
line << "\n\n"
|
169
|
-
when :mw_isolated_template
|
170
|
-
next if !config[:multiline]
|
171
|
-
line = e.last
|
172
|
-
line << "+ISOLATED_TEMPLATE+" if $DEBUG_MODE
|
173
|
-
when :mw_isolated_tag
|
174
|
-
next
|
97
|
+
puts ""
|
98
|
+
puts pastel.red.bold("Converting")
|
99
|
+
puts "Number of files being processed: " + pastel.bold(input_files.size.to_s)
|
100
|
+
puts "Number of CPU cores being used: " + pastel.bold(num_processes.to_s)
|
101
|
+
|
102
|
+
Parallel.map(input_files, progress: pastel.magenta.bold("WP2TXT"), in_processes: num_processes) do |infile|
|
103
|
+
wpconv = Runner.new(infile, output_dir, strip_tmarker, config[:del_interfile])
|
104
|
+
wpconv.extract_text do |article|
|
105
|
+
article.title = format_wiki(article.title, config)
|
106
|
+
|
107
|
+
if config[:category_only]
|
108
|
+
title = "#{article.title}\t"
|
109
|
+
contents = article.categories.join(", ")
|
110
|
+
contents << "\n"
|
111
|
+
elsif config[:category] && !article.categories.empty?
|
112
|
+
title = "\n[[#{article.title}]]\n\n"
|
113
|
+
contents = +"\nCATEGORIES: "
|
114
|
+
contents << article.categories.join(", ")
|
115
|
+
contents << "\n\n"
|
175
116
|
else
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
117
|
+
title = "\n[[#{article.title}]]\n\n"
|
118
|
+
contents = +""
|
119
|
+
end
|
120
|
+
|
121
|
+
unless config[:category_only]
|
122
|
+
article.elements.each do |e|
|
123
|
+
case e.first
|
124
|
+
when :mw_heading
|
125
|
+
break if config[:summary_only]
|
126
|
+
next unless config[:heading]
|
127
|
+
|
128
|
+
e[-1] = format_wiki(e.last, config)
|
129
|
+
line = e.last
|
130
|
+
line << "+HEADING+" if DEBUG_MODE
|
131
|
+
when :mw_paragraph
|
132
|
+
e[-1] = format_wiki(e.last, config)
|
133
|
+
line = e.last + "\n"
|
134
|
+
line << "+PARAGRAPH+" if DEBUG_MODE
|
135
|
+
when :mw_table, :mw_htable
|
136
|
+
next unless config[:table]
|
137
|
+
|
138
|
+
line = e.last
|
139
|
+
line << "+TABLE+" if DEBUG_MODE
|
140
|
+
when :mw_pre
|
141
|
+
next unless config[:pre]
|
142
|
+
|
143
|
+
line = e.last
|
144
|
+
line << "+PRE+" if DEBUG_MODE
|
145
|
+
when :mw_quote
|
146
|
+
line = e.last
|
147
|
+
line << "+QUOTE+" if DEBUG_MODE
|
148
|
+
when :mw_unordered, :mw_ordered, :mw_definition
|
149
|
+
next unless config[:list]
|
150
|
+
|
151
|
+
line = e.last
|
152
|
+
line << "+LIST+" if DEBUG_MODE
|
153
|
+
when :mw_ml_template
|
154
|
+
next unless config[:multiline]
|
155
|
+
|
156
|
+
line = e.last
|
157
|
+
line << "+MLTEMPLATE+" if DEBUG_MODE
|
158
|
+
when :mw_redirect
|
159
|
+
next unless config[:redirect]
|
160
|
+
|
161
|
+
line = e.last
|
162
|
+
line << "+REDIRECT+" if DEBUG_MODE
|
163
|
+
line << "\n\n"
|
164
|
+
when :mw_isolated_template
|
165
|
+
next unless config[:multiline]
|
166
|
+
|
167
|
+
line = e.last
|
168
|
+
line << "+ISOLATED_TEMPLATE+" if DEBUG_MODE
|
169
|
+
when :mw_isolated_tag
|
170
|
+
next
|
171
|
+
else
|
172
|
+
next unless DEBUG_MODE
|
173
|
+
|
174
|
+
line = e.last
|
175
|
+
line << "+OTHER+"
|
176
|
+
end
|
177
|
+
contents << line << "\n"
|
182
178
|
end
|
183
179
|
end
|
184
|
-
contents << line << "\n"
|
185
|
-
end
|
186
|
-
end
|
187
180
|
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
181
|
+
if /\A[\s ]*\z/m =~ contents
|
182
|
+
""
|
183
|
+
else
|
184
|
+
config[:title] ? title << contents : contents
|
185
|
+
end
|
186
|
+
end
|
192
187
|
end
|
188
|
+
puts pastel.blue.bold("Complete!")
|
193
189
|
end
|
194
190
|
end
|
195
191
|
|
196
|
-
|
197
|
-
|
192
|
+
WpApp.new.run
|